import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import warnings
import seaborn as sns
import scipy.stats as st
import gc
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('max_colwidth', 100)
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
path = 'traina/'
path1 = 'trainb/'
cust_info = pd.read_csv(path + '网约房平台入住人表.csv')
cust_info1 = pd.read_csv(path1 + '网约房平台入住人表.csv')
cust = pd.concat([cust_info, cust_info1])
cust.head(2)
| ORDER_PRIMARY_ID | GUEST_ID | BDATE | XZQH | IN_TIME | OUT_TIME |
---|
0 | 07C5BF73B18B44B0877DEED007F8771D | NaN | 19800627 | 222405 | NaN | NaN |
---|
1 | 3525D57CAE104A078E4962B2B89377B0 | 371099C301202107090001 | 19951025 | 370523 | 2.021071e+11 | 2.021071e+11 |
---|
gc.collect()
17
cust['BDATE'] = cust['BDATE'].astype('str')cust['XZQH'] = cust['XZQH'].astype('str')cust['IN_TIME'] = cust['IN_TIME'].fillna(0).astype('str').apply(lambda x: x[:12])cust['OUT_TIME'] = cust['OUT_TIME'].fillna(0).astype('str').apply(lambda x: x[:12])
cust.head(3)
| ORDER_PRIMARY_ID | GUEST_ID | BDATE | XZQH | IN_TIME | OUT_TIME |
---|
0 | 07C5BF73B18B44B0877DEED007F8771D | NaN | 19800627 | 222405 | 0.0 | 0.0 |
---|
1 | 3525D57CAE104A078E4962B2B89377B0 | 371099C301202107090001 | 19951025 | 370523 | 202107092103 | 202107111158 |
---|
2 | A1414E7D71E34E1DA743AC08BCF49151 | 3710021822202008240001 | 19860311 | 370781 | 202008241401 | 202008251200 |
---|
cust.info()
<class 'pandas.core.frame.DataFrame'>Int64Index: 44030 entries, 0 to 3343Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ORDER_PRIMARY_ID 44030 non-null object 1 GUEST_ID 5158 non-null object 2 BDATE 44030 non-null object 3 XZQH 44030 non-null object 4 IN_TIME 44030 non-null object 5 OUT_TIME 44030 non-null objectdtypes: object(6)memory usage: 2.4+ MB
cust.nunique()
ORDER_PRIMARY_ID 43883GUEST_ID 3698BDATE 9503XZQH 2659IN_TIME 3396OUT_TIME 236dtype: int64
cust['ORDER_PRIMARY_ID'].value_counts()[10:15]
134B74870F1D4087AB3F590D2A0AAFDC 23370A2886D8F4E11860A00FEB2289E56 223D42363F202408BA04CD3D5BBAE5508 219631D36ABA448D9B0AECF0A7200C4C5 29373325C891E42CB8A8F43AF14939954 2Name: ORDER_PRIMARY_ID, dtype: int64
cust.sort_values(['ORDER_PRIMARY_ID', 'GUEST_ID'], inplace=True)
stat = cust.groupby(['ORDER_PRIMARY_ID'])['BDATE'].count().reset_index()stat.columns = ['ORDER_PRIMARY_ID', 'guest_sum']stat['guest_sum_notnull'] = cust.groupby(['ORDER_PRIMARY_ID'])['GUEST_ID'].agg('count').valuescust = cust.merge(stat, on=['ORDER_PRIMARY_ID'], how='left')
cust.drop_duplicates('ORDER_PRIMARY_ID', keep='first', inplace=True)
cust[cust['ORDER_PRIMARY_ID']=='70798782D6C04A438360D80AFE4845C1']
| ORDER_PRIMARY_ID | GUEST_ID | BDATE | XZQH | IN_TIME | OUT_TIME | guest_sum | guest_sum_notnull |
---|
22801 | 70798782D6C04A438360D80AFE4845C1 | 371099B389202107040001 | 19990314 | 130983 | 0.0 | 0.0 | 3 | 2 |
---|
cust[cust['ORDER_PRIMARY_ID']=='5B94671A9390437E8E79A86D4B15A076']
| ORDER_PRIMARY_ID | GUEST_ID | BDATE | XZQH | IN_TIME | OUT_TIME | guest_sum | guest_sum_notnull |
---|
18631 | 5B94671A9390437E8E79A86D4B15A076 | 371098A226202107010001 | 20000114 | 372324 | 0.0 | 0.0 | 2 | 1 |
---|
cust[cust['ORDER_PRIMARY_ID']=='479006B921F24969B60B68DAEDA91909']
| ORDER_PRIMARY_ID | GUEST_ID | BDATE | XZQH | IN_TIME | OUT_TIME | guest_sum | guest_sum_notnull |
---|
14578 | 479006B921F24969B60B68DAEDA91909 | 3710020982202010020001 | 19841013 | 210103 | 202010021601 | 202010061200 | 2 | 2 |
---|
cust[cust['ORDER_PRIMARY_ID']=='1772C1DEE3F141D8960BB9044CA4CF3A']
| ORDER_PRIMARY_ID | GUEST_ID | BDATE | XZQH | IN_TIME | OUT_TIME | guest_sum | guest_sum_notnull |
---|
4753 | 1772C1DEE3F141D8960BB9044CA4CF3A | 371098A783202107050001 | 19620916 | 422226 | 0.0 | 0.0 | 3 | 2 |
---|
cust[cust['ORDER_PRIMARY_ID']=='BD39A40287CE4E76B46D3BC85BFA1298']
| ORDER_PRIMARY_ID | GUEST_ID | BDATE | XZQH | IN_TIME | OUT_TIME | guest_sum | guest_sum_notnull |
---|
38447 | BD39A40287CE4E76B46D3BC85BFA1298 | 371099A428202105100004 | 19710219 | 370204 | 0.0 | 202105211834 | 2 | 1 |
---|
cust['IN_TIME'].max()
'202110041413'
room_info = pd.read_csv(path + '网约房注册民宿.csv')
room_info.head(1)
| CODE | HOTELID | JYMJ | ROOM_NUM | BED_NUM | FWLY | CZLY | CALLED | CITY_CODE | BUR_CODE | STA_CODE | SSX | ADDRESS | MPHM | JYQK | FIRM | DJSJ | BGSJ | STATUS | AUDITSTATUS |
---|
0 | 100177 | 3710830002 | 0.0 | 1 | 1 | 2 | 1 | 【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽 | 371000000000 | NaN | NaN | 371083 | 乳山市长江路 银泰海景花园 55-301 | ROOM001 | 【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽 | 3 | 2020-05-18 10:33:55 | 2020-07-15 10:23:58 | NaN | NaN |
---|
room_info[(room_info['CALLED']==room_info['JYQK'])==False]
| CODE | HOTELID | JYMJ | ROOM_NUM | BED_NUM | FWLY | CZLY | CALLED | CITY_CODE | BUR_CODE | STA_CODE | SSX | ADDRESS | MPHM | JYQK | FIRM | DJSJ | BGSJ | STATUS | AUDITSTATUS |
---|
room_info['CALLED'].nunique()
4232
room_info['CALLED'].value_counts()[:5]
高铁北站看海公寓 24韩乐坊/不夜城美食街/乐天世纪城 豪华大床房 景驰公寓 20国际海水浴场精致海景大床房 12九龙湾豪华一线海景loft复式 10高铁北站看海民宿 10Name: CALLED, dtype: int64
room_info['HOTELID'].nunique()
5307
room_info['CODE'].nunique()
5307
room_info.info()
<class 'pandas.core.frame.DataFrame'>RangeIndex: 5307 entries, 0 to 5306Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CODE 5307 non-null object 1 HOTELID 5307 non-null object 2 JYMJ 5307 non-null float64 3 ROOM_NUM 5307 non-null int64 4 BED_NUM 5307 non-null int64 5 FWLY 5307 non-null int64 6 CZLY 5307 non-null int64 7 CALLED 5307 non-null object 8 CITY_CODE 5307 non-null int64 9 BUR_CODE 2658 non-null float64 10 STA_CODE 2658 non-null object 11 SSX 5307 non-null int64 12 ADDRESS 5307 non-null object 13 MPHM 5307 non-null object 14 JYQK 5307 non-null object 15 FIRM 5307 non-null int64 16 DJSJ 5307 non-null object 17 BGSJ 5307 non-null object 18 STATUS 2658 non-null float64 19 AUDITSTATUS 2658 non-null float64dtypes: float64(4), int64(7), object(9)memory usage: 829.3+ KB
room_info.nunique()
CODE 5307HOTELID 5307JYMJ 173ROOM_NUM 11BED_NUM 17FWLY 1CZLY 4CALLED 4232CITY_CODE 1BUR_CODE 7STA_CODE 54SSX 5ADDRESS 4850MPHM 584JYQK 4232FIRM 2DJSJ 3753BGSJ 3294STATUS 1AUDITSTATUS 1dtype: int64
room_info.drop(['FWLY', 'CITY_CODE', 'STATUS', 'AUDITSTATUS', 'JYQK'], axis=1, inplace=True)
room_info['FIRM'].value_counts()
3 342910 1878Name: FIRM, dtype: int64
order.head(2)
| ORDER_ID | ORDER_PRIMARY_ID | HOTELID | PRE_IN_TIME | PRE_OUT_TIME | ORDER_TIME | STATUS | CANCEL_TIME | INSERT_TIME | MODIFY_TIME | FIRM |
---|
0 | 923521 | 96BBDB7CC049421C85826AE07020B139 | 278337 | 202008011200 | 202008021200 | 1.596120e+11 | 1 | NaN | 20200730224152 | 20200730224152 | 3 |
---|
1 | 923696 | C72F20539AD1447D86CD1A8E5EAEC63A | 282932 | 202008041400 | 202008061200 | 1.596121e+11 | 1 | NaN | 20200730225524 | 20200730225524 | 3 |
---|
order[order['ORDER_PRIMARY_ID']=='3CDEDB5E03534D379687645675898CA4']
| ORDER_ID | ORDER_PRIMARY_ID | HOTELID | PRE_IN_TIME | PRE_OUT_TIME | ORDER_TIME | STATUS | CANCEL_TIME | INSERT_TIME | MODIFY_TIME | FIRM |
---|
30970 | 4422027 | 3CDEDB5E03534D379687645675898CA4 | 170898 | 202108081358 | 202108101158 | 2.021071e+11 | 3 | NaN | 20210712210153 | 20210712210153 | 3 |
---|
cust[cust['ORDER_PRIMARY_ID']=='3CDEDB5E03534D379687645675898CA4']
| ORDER_PRIMARY_ID | GUEST_ID | BDATE | XZQH | IN_TIME | OUT_TIME | guest_sum | guest_sum_notnull |
---|
12421 | 3CDEDB5E03534D379687645675898CA4 | 371002B268202107280001 | 19871031 | 110224 | 0.0 | 202108020842 | 1 | 1 |
---|
order['PRE_IN_TIME'] = order['PRE_IN_TIME'].astype('str')order['PRE_OUT_TIME'] = order['PRE_OUT_TIME'].astype('str')
order['ORDER_TIME'] = order['ORDER_TIME'].fillna(0).astype('str').apply(lambda x: x[:12])
##########################没有考虑包含取消时间的订单
order = order[(order['CANCEL_TIME']=='nan')]
order = order.sort_values(['ORDER_ID', 'MODIFY_TIME'])
order.drop_duplicates('ORDER_ID', keep='last', inplace=True)
order.shape
(29941, 11)
order[order['ORDER_ID']==4402846]
| ORDER_ID | ORDER_PRIMARY_ID | HOTELID | PRE_IN_TIME | PRE_OUT_TIME | ORDER_TIME | STATUS | CANCEL_TIME | INSERT_TIME | MODIFY_TIME | FIRM |
---|
30141 | 4402846 | BC0E8E3602434EA6A5F29A6F6FF42233 | 100177 | 202107111358 | 202107121158 | 202107111032 | 1 | nan | 20210711140042 | 20210711140042 | 3 |
---|
order.info()
<class 'pandas.core.frame.DataFrame'>Int64Index: 29941 entries, 2 to 18590Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ORDER_ID 29941 non-null int64 1 ORDER_PRIMARY_ID 29941 non-null object 2 HOTELID 29941 non-null object 3 PRE_IN_TIME 29941 non-null object 4 PRE_OUT_TIME 29941 non-null object 5 ORDER_TIME 29941 non-null object 6 STATUS 29941 non-null int64 7 CANCEL_TIME 29941 non-null object 8 INSERT_TIME 29941 non-null object 9 MODIFY_TIME 29941 non-null object 10 FIRM 29941 non-null int64 dtypes: int64(3), object(8)memory usage: 2.7+ MB
order.nunique()
ORDER_ID 29941ORDER_PRIMARY_ID 29941HOTELID 4801PRE_IN_TIME 6875PRE_OUT_TIME 729ORDER_TIME 26818STATUS 3CANCEL_TIME 1INSERT_TIME 25432MODIFY_TIME 25432FIRM 2dtype: int64
order_info['FIRM'].value_counts()
3 3202910 8521Name: FIRM, dtype: int64
order['STATUS'].value_counts()
1 261372 32213 583Name: STATUS, dtype: int64
| ORDER_ID | ORDER_PRIMARY_ID | HOTELID | PRE_IN_TIME | PRE_OUT_TIME | ORDER_TIME | STATUS | CANCEL_TIME | INSERT_TIME | MODIFY_TIME | FIRM |
---|
合并数据
月份分为1、3、5、7、8、10、12:31天;2:28天;4、6、9、11:30天
df['IN_TIME'] = df['IN_TIME'].apply(lambda x: np.nan if x=='0.0' else x)df['OUT_TIME'] = df['OUT_TIME'].apply(lambda x: np.nan if x=='0.0' else x)df['IN_TIME'].fillna(df['PRE_IN_TIME'], inplace=True)df['OUT_TIME'].fillna(df['PRE_OUT_TIME'], inplace=True)
df['OUT_TIME'] = df.apply(lambda x: x['OUT_TIME'] if x['OUT_TIME'] >= x['PRE_OUT_TIME'] else x['PRE_OUT_TIME'], axis=1)df['IN_TIME'] = df.apply(lambda x: x['IN_TIME'] if x['IN_TIME'] >= x['PRE_IN_TIME'] else x['PRE_IN_TIME'], axis=1)
df.drop(['PRE_IN_TIME', 'PRE_OUT_TIME', 'CANCEL_TIME'], axis=1, inplace=True)
df.head(2)
| ORDER_ID | ORDER_PRIMARY_ID | HOTELID | ORDER_TIME | STATUS | INSERT_TIME | MODIFY_TIME | FIRM | GUEST_ID | BDATE | XZQH | IN_TIME | OUT_TIME | guest_sum | guest_sum_notnull |
---|
0 | 706648 | A4AACE06B518418C8A8CA1935DDD1C5A | 227185 | 202007092241 | 1 | 20200714171021 | 20200714171021 | 3 | NaN | 20010409 | 371002 | 202007151300 | 202007161200 | 1 | 0 |
---|
1 | 748647 | 67D551AACFD049AE9CC2AB65E9870678 | 9483 | 202007132109 | 1 | 20201002101040 | 20201002101040 | 3 | NaN | 19881023 | 341202 | 202010011400 | 202010021011 | 1 | 0 |
---|
df.info()
<class 'pandas.core.frame.DataFrame'>Int64Index: 29941 entries, 0 to 29940Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ORDER_ID 29941 non-null int64 1 ORDER_PRIMARY_ID 29941 non-null object 2 HOTELID 29941 non-null object 3 ORDER_TIME 29941 non-null object 4 STATUS 29941 non-null int64 5 INSERT_TIME 29941 non-null object 6 MODIFY_TIME 29941 non-null object 7 FIRM 29941 non-null int64 8 GUEST_ID 3561 non-null object 9 BDATE 29941 non-null object 10 XZQH 29941 non-null object 11 IN_TIME 29941 non-null object 12 OUT_TIME 29941 non-null object 13 guest_sum 29941 non-null int64 14 guest_sum_notnull 29941 non-null int64 dtypes: int64(5), object(10)memory usage: 3.7+ MB
df.nunique();
ORDER_ID 29941ORDER_PRIMARY_ID 29941HOTELID 4801ORDER_TIME 26818STATUS 3INSERT_TIME 25432MODIFY_TIME 25432FIRM 2GUEST_ID 3561BDATE 9036XZQH 2575IN_TIME 8953OUT_TIME 732guest_sum 3guest_sum_notnull 3dtype: int64
df['in_time'] = pd.to_datetime(df['IN_TIME'], errors='coerce', format='%Y%m%d%H%M')df['out_time'] = pd.to_datetime(df['OUT_TIME'], errors='coerce', format='%Y%m%d%H%M')
df = df[~df['out_time'].isnull()]
df.head(2)
| ORDER_ID | ORDER_PRIMARY_ID | HOTELID | ORDER_TIME | STATUS | INSERT_TIME | MODIFY_TIME | FIRM | GUEST_ID | BDATE | XZQH | IN_TIME | OUT_TIME | guest_sum | guest_sum_notnull | in_time | out_time |
---|
0 | 706648 | A4AACE06B518418C8A8CA1935DDD1C5A | 227185 | 202007092241 | 1 | 20200714171021 | 20200714171021 | 3 | NaN | 20010409 | 371002 | 202007151300 | 202007161200 | 1 | 0 | 2020-07-15 13:00:00 | 2020-07-16 12:00:00 |
---|
1 | 748647 | 67D551AACFD049AE9CC2AB65E9870678 | 9483 | 202007132109 | 1 | 20201002101040 | 20201002101040 | 3 | NaN | 19881023 | 341202 | 202010011400 | 202010021011 | 1 | 0 | 2020-10-01 14:00:00 | 2020-10-02 10:11:00 |
---|
df['in_time_year'] = df['in_time'].dt.year.fillna(0).astype('int')
df[df['day_gap']>32]
| ORDER_ID | ORDER_PRIMARY_ID | HOTELID | ORDER_TIME | STATUS | INSERT_TIME | MODIFY_TIME | FIRM | GUEST_ID | BDATE | XZQH | IN_TIME | OUT_TIME | guest_sum | guest_sum_notnull | in_time | out_time | in_time_year | out_time_year | in_time_month | out_time_month | in_time_day | out_time_day | in_time_hour | out_time_hour | day_gap |
---|
9154 | 4081679 | 4DCD1BDDF65B425292A7B92C8E895F0D | 315187 | 202106151752 | 3 | 20210615175403 | 20210615175403 | 3 | 371002A676202106160001 | 19890208 | 371302 | 202106161358 | 202107311158 | 1 | 1 | 2021-06-16 13:58:00 | 2021-07-31 11:58:00 | 2021 | 2021 | 6 | 7 | 16 | 31 | 13 | 11 | 45 |
---|
20135 | 4726566 | 25EF2A4A528D468BAE986CF25E88A480 | 282552 | 202108121704 | 1 | 20210813120007 | 20210813120007 | 3 | NaN | 19650327 | 230122 | 202108131158 | 202110121158 | 1 | 0 | 2021-08-13 11:58:00 | 2021-10-12 11:58:00 | 2021 | 2021 | 8 | 10 | 13 | 12 | 11 | 11 | 59 |
---|
26356 | 30851758 | 62AEF4D8DECE4715962AC0588536F3FE | B102717002 | 202012281949 | 1 | 20201228195904 | 20201228195904 | 10 | NaN | 19990312 | 411303 | 202101060000 | 202102280000 | 1 | 0 | 2021-01-06 00:00:00 | 2021-02-28 00:00:00 | 2021 | 2021 | 1 | 2 | 6 | 28 | 0 | 0 | 52 |
---|
month_ = [1, 3, 5, 7, 8, 10]df['day_gap'] = df.apply(lambda x: x['day_gap']+1 if x['in_time_month'] in month_ and x['out_time_month']>x['in_time_month'] else x['day_gap'], axis=1)
df['day_gap'].sum()
49261
#后移一天 df[‘lock_time’][0] + pd.Timedelta(days=1) #前移一天 df[‘lock_time’][0] + pd.Timedelta(days=-1) #df[‘in_time’] - timedelta(days=1) #将day_gap列为0的数据全部往后推迟一天;;;;;;;还有钟点房
dfs = []for idx, group in df.groupby(['ORDER_PRIMARY_ID']): stat = pd.DataFrame() stat['date'] = pd.date_range(start=group['IN_TIME'].values[0], end=group['OUT_TIME'].values[0], freq='D', normalize=False, closed=None) stat['ORDER_PRIMARY_ID'] = group['ORDER_PRIMARY_ID'].values[0] dfs.append(stat)
df_date = pd.concat(dfs).reset_index(drop=True)
df1 = df.merge(df_date, on=['ORDER_PRIMARY_ID'], how='left')
df1.shape
(60973, 27)
df1[(df1['out_time']!=df1['date']) & (df1['day_gap']!=0)].shape
(50058, 27)
df1[:10]
| ORDER_ID | ORDER_PRIMARY_ID | HOTELID | ORDER_TIME | STATUS | INSERT_TIME | MODIFY_TIME | FIRM | GUEST_ID | BDATE | XZQH | IN_TIME | OUT_TIME | guest_sum | guest_sum_notnull | in_time | out_time | in_time_year | out_time_year | in_time_month | out_time_month | in_time_day | out_time_day | in_time_hour | out_time_hour | day_gap | date |
---|
0 | 706648 | A4AACE06B518418C8A8CA1935DDD1C5A | 227185 | 202007092241 | 1 | 20200714171021 | 20200714171021 | 3 | NaN | 20010409 | 371002 | 202007151300 | 202007161200 | 1 | 0 | 2020-07-15 13:00:00 | 2020-07-16 12:00:00 | 2020 | 2020 | 7 | 7 | 15 | 16 | 13 | 12 | 1 | 2020-07-15 13:00:00 |
---|
1 | 748647 | 67D551AACFD049AE9CC2AB65E9870678 | 9483 | 202007132109 | 1 | 20201002101040 | 20201002101040 | 3 | NaN | 19881023 | 341202 | 202010011400 | 202010021011 | 1 | 0 | 2020-10-01 14:00:00 | 2020-10-02 10:11:00 | 2020 | 2020 | 10 | 10 | 1 | 2 | 14 | 10 | 1 | 2020-10-01 14:00:00 |
---|
2 | 757455 | 43F69E6DFE004103BC4741C355D2A793 | 275834 | 202007141819 | 2 | 20200714181929 | 20200714181929 | 3 | 3710021366202007150001 | 19920216 | 370784 | 202007151400 | 202007161200 | 1 | 1 | 2020-07-15 14:00:00 | 2020-07-16 12:00:00 | 2020 | 2020 | 7 | 7 | 15 | 16 | 14 | 12 | 1 | 2020-07-15 14:00:00 |
---|
3 | 757754 | 42157BB31FD340519BB6C8DD073F2FBC | 8489 | 202007141847 | 1 | 20200714184726 | 20200714184726 | 3 | NaN | 19860622 | 370102 | 202007151400 | 202007171200 | 1 | 0 | 2020-07-15 14:00:00 | 2020-07-17 12:00:00 | 2020 | 2020 | 7 | 7 | 15 | 17 | 14 | 12 | 2 | 2020-07-15 14:00:00 |
---|
4 | 757754 | 42157BB31FD340519BB6C8DD073F2FBC | 8489 | 202007141847 | 1 | 20200714184726 | 20200714184726 | 3 | NaN | 19860622 | 370102 | 202007151400 | 202007171200 | 1 | 0 | 2020-07-15 14:00:00 | 2020-07-17 12:00:00 | 2020 | 2020 | 7 | 7 | 15 | 17 | 14 | 12 | 2 | 2020-07-16 14:00:00 |
---|
5 | 757829 | 8C0C98A722FC4A3DAEFC6296AC63FF84 | 149513 | 202007141854 | 1 | 20200714185457 | 20200714185457 | 3 | NaN | 19970731 | 370983 | 202007241400 | 202007251200 | 1 | 0 | 2020-07-24 14:00:00 | 2020-07-25 12:00:00 | 2020 | 2020 | 7 | 7 | 24 | 25 | 14 | 12 | 1 | 2020-07-24 14:00:00 |
---|
6 | 758518 | 45B92315B9F44E59B383046A05429357 | 149876 | 202007142003 | 1 | 20200714200342 | 20200714200342 | 3 | NaN | 19790126 | 370105 | 202007191400 | 202007211200 | 1 | 0 | 2020-07-19 14:00:00 | 2020-07-21 12:00:00 | 2020 | 2020 | 7 | 7 | 19 | 21 | 14 | 12 | 2 | 2020-07-19 14:00:00 |
---|
7 | 758518 | 45B92315B9F44E59B383046A05429357 | 149876 | 202007142003 | 1 | 20200714200342 | 20200714200342 | 3 | NaN | 19790126 | 370105 | 202007191400 | 202007211200 | 1 | 0 | 2020-07-19 14:00:00 | 2020-07-21 12:00:00 | 2020 | 2020 | 7 | 7 | 19 | 21 | 14 | 12 | 2 | 2020-07-20 14:00:00 |
---|
8 | 759511 | CDE6BDDDE85044F7BAA53F377DFC012F | 10273 | 202007142137 | 1 | 20200714213718 | 20200714213718 | 3 | NaN | 19840919 | 370883 | 202007161400 | 202007191200 | 1 | 0 | 2020-07-16 14:00:00 | 2020-07-19 12:00:00 | 2020 | 2020 | 7 | 7 | 16 | 19 | 14 | 12 | 3 | 2020-07-16 14:00:00 |
---|
9 | 759511 | CDE6BDDDE85044F7BAA53F377DFC012F | 10273 | 202007142137 | 1 | 20200714213718 | 20200714213718 | 3 | NaN | 19840919 | 370883 | 202007161400 | 202007191200 | 1 | 0 | 2020-07-16 14:00:00 | 2020-07-19 12:00:00 | 2020 | 2020 | 7 | 7 | 16 | 19 | 14 | 12 | 3 | 2020-07-17 14:00:00 |
---|
df1[(df1['HOTELID']=='149876') & (df1['in_time_month']==9)]
| ORDER_ID | ORDER_PRIMARY_ID | HOTELID | ORDER_TIME | STATUS | INSERT_TIME | MODIFY_TIME | FIRM | GUEST_ID | BDATE | XZQH | IN_TIME | OUT_TIME | guest_sum | guest_sum_notnull | in_time | out_time | in_time_year | out_time_year | in_time_month | out_time_month | in_time_day | out_time_day | in_time_hour | out_time_hour | day_gap | date |
---|
df1[df1['day_gap']==0][:2]
| ORDER_ID | ORDER_PRIMARY_ID | HOTELID | ORDER_TIME | STATUS | INSERT_TIME | MODIFY_TIME | FIRM | GUEST_ID | BDATE | XZQH | IN_TIME | OUT_TIME | guest_sum | guest_sum_notnull | in_time | out_time | in_time_year | out_time_year | in_time_month | out_time_month | in_time_day | out_time_day | in_time_hour | out_time_hour | day_gap | date |
---|
945 | 1067205 | BD1B2089E4DA4D11A61852F393565F4B | 276992 | 202008130010 | 3 | 20200813001054 | 20200813001054 | 3 | 3710021438202008140001 | 19940402 | 410102 | 202008140817 | 202008141200 | 1 | 1 | 2020-08-14 08:17:00 | 2020-08-14 12:00:00 | 2020 | 2020 | 8 | 8 | 14 | 14 | 8 | 12 | 0 | 2020-08-14 08:17:00 |
---|
2042 | 1369323 | D063FB8024EC432B89E7A9943EE8C504 | 8845 | 202009110304 | 1 | 20200912141247 | 20200912141247 | 3 | NaN | 19881128 | 370103 | 202009121400 | 202009121413 | 1 | 0 | 2020-09-12 14:00:00 | 2020-09-12 14:13:00 | 2020 | 2020 | 9 | 9 | 12 | 12 | 14 | 14 | 0 | 2020-09-12 14:00:00 |
---|
stat = pd.DataFrame() stat[‘date’] = pd.date_range( start=group[‘IN_TIME’].values[0],#开始时间 end=group[‘OUT_TIME’].values[0],#截止时间 periods=4,#总长度 freq=‘D’,#时间间隔 tz=None,#时区 normalize=False,#是否标准化到midnight name=None,#date名称 closed=None,#首尾是否在内 **kwargs, )
stat['ORDER_PRIMARY_ID'] = group['ORDER_PRIMARY_ID'].values[0]
stat
| date |
---|
0 | 2021-06-30 13:58:00 |
---|
1 | 2021-07-01 13:58:00 |
---|
2 | 2021-07-02 13:58:00 |
---|
3 | 2021-07-03 13:58:00 |
---|
df[df['HOTELID']=='329845']
| ORDER_ID | ORDER_PRIMARY_ID | HOTELID | ORDER_TIME | STATUS | INSERT_TIME | MODIFY_TIME | FIRM | GUEST_ID | BDATE | XZQH | IN_TIME | OUT_TIME | guest_sum | guest_sum_notnull | in_time | out_time | in_time_year | out_time_year | in_time_month | out_time_month | in_time_day | out_time_day | in_time_hour | out_time_hour | day_gap |
---|
19861 | 4682023 | B9DF5566F1A1436793714B423FF03526 | 329845 | 202108052135 | 2 | 20210805213700 | 20210805213700 | 3 | 371002B740202108050001 | 19940626 | 230381 | 202108052321 | 202108061158 | 1 | 1 | 2021-08-05 23:02:01 | 2021-08-06 11:05:08 | 2021 | 2021 | 8 | 8 | 5 | 6 | 23 | 11 | 1 |
---|
19960 | 4698714 | CD3507B0EA2B4A95A1FF52677589FBD1 | 329845 | 202108081409 | 2 | 20210808141123 | 20210808141123 | 3 | 371002B740202108080001 | 19701008 | 230184 | 202108082122 | 202108101158 | 1 | 1 | 2021-08-08 21:02:02 | 2021-08-10 11:05:08 | 2021 | 2021 | 8 | 8 | 8 | 10 | 21 | 11 | 2 |
---|
20357 | 4754863 | 0013998FA8FB420D94A3AD0C16D46BA8 | 329845 | 202108161319 | 2 | 20210826125548 | 20210826125548 | 3 | 371002B740202108260001 | 19780323 | 149001 | 202108261805 | 202108291158 | 1 | 1 | 2021-08-26 18:00:05 | 2021-08-29 11:05:08 | 2021 | 2021 | 8 | 8 | 26 | 29 | 18 | 11 | 3 |
---|
20370 | 4756724 | 6D3512C9513145B48B35FFC5A98FAD5A | 329845 | 202108161757 | 2 | 20210816180002 | 20210816180002 | 3 | 371002B740202108160001 | 19871210 | 231222 | 202108161903 | 202108171158 | 1 | 1 | 2021-08-16 19:00:03 | 2021-08-17 11:05:08 | 2021 | 2021 | 8 | 8 | 16 | 17 | 19 | 11 | 1 |
---|
20592 | 4787023 | 757875461A7C437AAAE942F0CE4E0043 | 329845 | 202108201739 | 2 | 20210822103001 | 20210822103001 | 3 | 371002B740202108230001 | 19920605 | 430122 | 202108222108 | 202108231158 | 1 | 1 | 2021-08-22 21:00:08 | 2021-08-23 11:05:08 | 2021 | 2021 | 8 | 8 | 22 | 23 | 21 | 11 | 1 |
---|
20876 | 4818354 | 35B453B2472C4812A1332BECED86275D | 329845 | 202108241537 | 3 | 20210824154233 | 20210824154233 | 3 | 371002B740202108250001 | 19860313 | 230231 | 202108251158 | 202108261158 | 1 | 1 | 2021-08-25 11:05:08 | 2021-08-26 11:05:08 | 2021 | 2021 | 8 | 8 | 25 | 26 | 11 | 11 | 1 |
---|
21318 | 4867480 | 674C4B99BDA64E56892AAB65223C5533 | 329845 | 202108301202 | 2 | 20210830123119 | 20210830123119 | 3 | 371002B740202108300001 | 19940523 | 370687 | 202108302105 | 202108311158 | 1 | 1 | 2021-08-30 21:00:05 | 2021-08-31 11:05:08 | 2021 | 2021 | 8 | 8 | 30 | 31 | 21 | 11 | 1 |
---|
21387 | 4875628 | A2F43E8280B249E5A9BDE08F5A5F2664 | 329845 | 202108311252 | 2 | 20210831130914 | 20210831130914 | 3 | 371002B740202109010001 | 19811024 | 220381 | 202109010857 | 202109011158 | 1 | 1 | 2021-09-01 08:05:07 | 2021-09-01 11:05:08 | 2021 | 2021 | 9 | 9 | 1 | 1 | 8 | 11 | 0 |
---|
room_info[room_info['CODE']=='329845']
| CODE | HOTELID | JYMJ | ROOM_NUM | BED_NUM | CZLY | CALLED | BUR_CODE | STA_CODE | SSX | ADDRESS | MPHM | FIRM | DJSJ | BGSJ |
---|
3051 | 329845 | 371002B740 | 0.0 | 2 | 3 | 1 | 温馨精致观海房/loft名宿/孙家疃油画小镇/临近海水浴场 | 3.710020e+11 | 371000000001 | 371002 | 环翠区孙家疃望海·山公馆公寓13-16_B304室---__-~ | ROOM001 | 3 | 2021-08-03 09:58:05 | 2021-08-03 10:01:02 |
---|
df_month = df.groupby(['in_time_year', 'in_time_month'])['ORDER_ID']df_month.count()
in_time_year in_time_month2020 7 182 8 885 9 1571 10 2226 11 1179 12 22792021 1 1596 2 137 3 5 4 420 5 4893 6 3856 7 8507 8 2069 9 99 10 25 11 1Name: ORDER_ID, dtype: int64
df['IN_TIME'].min(), df['IN_TIME'].max()
('202007151300', '202111022358')
df.info()
<class 'pandas.core.frame.DataFrame'>Int64Index: 29930 entries, 0 to 29940Data columns (total 26 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ORDER_ID 29930 non-null int64 1 ORDER_PRIMARY_ID 29930 non-null object 2 HOTELID 29930 non-null object 3 ORDER_TIME 29930 non-null object 4 STATUS 29930 non-null int64 5 INSERT_TIME 29930 non-null object 6 MODIFY_TIME 29930 non-null object 7 FIRM 29930 non-null int64 8 GUEST_ID 3561 non-null object 9 BDATE 29930 non-null object 10 XZQH 29930 non-null object 11 IN_TIME 29930 non-null object 12 OUT_TIME 29930 non-null object 13 guest_sum 29930 non-null int64 14 guest_sum_notnull 29930 non-null int64 15 in_time 29930 non-null datetime64[ns] 16 out_time 29930 non-null datetime64[ns] 17 in_time_year 29930 non-null int32 18 out_time_year 29930 non-null int32 19 in_time_month 29930 non-null int32 20 out_time_month 29930 non-null int32 21 in_time_day 29930 non-null int32 22 out_time_day 29930 non-null int32 23 in_time_hour 29930 non-null int32 24 out_time_hour 29930 non-null int32 25 day_gap 29930 non-null int64 dtypes: datetime64[ns](2), int32(8), int64(6), object(10)memory usage: 5.3+ MB
df.nunique()
ORDER_ID 29930ORDER_PRIMARY_ID 29930HOTELID 4801ORDER_TIME 26807STATUS 3INSERT_TIME 25421MODIFY_TIME 25421FIRM 2GUEST_ID 3561BDATE 9035XZQH 2575IN_TIME 8953OUT_TIME 730guest_sum 3guest_sum_notnull 3in_time 8953out_time 730in_time_year 2out_time_year 2in_time_month 12out_time_month 12in_time_day 31out_time_day 31in_time_hour 23out_time_hour 15day_gap 33dtype: int64
df.rename(columns={'HOTELID':'CODE'}, inplace=True)
df = pd.merge(df, room_info, on=['CODE', 'FIRM'])
df.head(2)
| ORDER_ID | ORDER_PRIMARY_ID | CODE | PRE_IN_TIME | PRE_OUT_TIME | ORDER_TIME | STATUS | CANCEL_TIME | INSERT_TIME | MODIFY_TIME | FIRM | GUEST_ID | BDATE | XZQH | IN_TIME | OUT_TIME | in_hour | in_minute | in_time | out_hour | out_minute | out_time | in_datetime | out_datetime | HOTELID | JYMJ | ROOM_NUM | BED_NUM | CZLY | CALLED | BUR_CODE | STA_CODE | SSX | ADDRESS | MPHM | DJSJ | BGSJ |
---|
0 | 923521 | 96BBDB7CC049421C85826AE07020B139 | 278337 | 202008011200 | 202008021200 | 159612011149.0 | 1 | nan | 20200730224152 | 20200730224152 | 3 | NaN | 19930804 | 320821 | 0.0 | 0.0 | 0 | .0 | 0.0 | 0 | .0 | 0.0 | NaT | NaT | 37100A1620 | 0.0 | 1 | 2 | 1 | 金海滩临海落地窗公寓,北欧式装修 | NaN | NaN | 371002 | 环翠区高区金海滩街道新浪屿花园8号楼1104 | 1104 | 2020-07-06 17:59:35 | 2020-07-15 10:24:21 |
---|
1 | 761299 | 8923C26CC35F42C9A7C0A08BCC22179C | 278337 | 202007221200 | 202007251200 | 202007150017.0 | 1 | nan | 20200715001752 | 20200715001752 | 3 | NaN | 19810628 | 120101 | 0.0 | 0.0 | 0 | .0 | 0.0 | 0 | .0 | 0.0 | NaT | NaT | 37100A1620 | 0.0 | 1 | 2 | 1 | 金海滩临海落地窗公寓,北欧式装修 | NaN | NaN | 371002 | 环翠区高区金海滩街道新浪屿花园8号楼1104 | 1104 | 2020-07-06 17:59:35 | 2020-07-15 10:24:21 |
---|
df.info()
<class 'pandas.core.frame.DataFrame'>Int64Index: 40043 entries, 0 to 40042Data columns (total 37 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ORDER_ID 40043 non-null int64 1 ORDER_PRIMARY_ID 40043 non-null object 2 CODE 40043 non-null object 3 PRE_IN_TIME 40043 non-null object 4 PRE_OUT_TIME 40043 non-null object 5 ORDER_TIME 40043 non-null object 6 STATUS 40043 non-null int64 7 CANCEL_TIME 40043 non-null object 8 INSERT_TIME 40043 non-null object 9 MODIFY_TIME 40043 non-null object 10 FIRM 40043 non-null int64 11 GUEST_ID 5131 non-null object 12 BDATE 40043 non-null object 13 XZQH 40043 non-null object 14 IN_TIME 40043 non-null object 15 OUT_TIME 40043 non-null object 16 in_hour 40043 non-null object 17 in_minute 40043 non-null object 18 in_time 40043 non-null object 19 out_hour 40043 non-null object 20 out_minute 40043 non-null object 21 out_time 40043 non-null object 22 in_datetime 4913 non-null datetime64[ns] 23 out_datetime 644 non-null datetime64[ns] 24 HOTELID 40043 non-null object 25 JYMJ 40043 non-null float64 26 ROOM_NUM 40043 non-null int64 27 BED_NUM 40043 non-null int64 28 CZLY 40043 non-null int64 29 CALLED 40043 non-null object 30 BUR_CODE 20204 non-null float64 31 STA_CODE 20204 non-null object 32 SSX 40043 non-null int64 33 ADDRESS 40043 non-null object 34 MPHM 40043 non-null object 35 DJSJ 40043 non-null object 36 BGSJ 40043 non-null object dtypes: datetime64[ns](2), float64(2), int64(7), object(26)memory usage: 11.6+ MB
df.nunique()
ORDER_ID 29941ORDER_PRIMARY_ID 39901CODE 4801PRE_IN_TIME 6932PRE_OUT_TIME 741ORDER_TIME 26892STATUS 3CANCEL_TIME 1INSERT_TIME 35319MODIFY_TIME 35319FIRM 2GUEST_ID 3675BDATE 9064XZQH 2578IN_TIME 3380OUT_TIME 235in_hour 24in_minute 61in_time 264out_hour 15out_minute 25out_time 211in_datetime 263out_datetime 210HOTELID 4801JYMJ 173ROOM_NUM 11BED_NUM 17CZLY 4CALLED 3813BUR_CODE 7STA_CODE 53SSX 5ADDRESS 4377MPHM 534DJSJ 3363BGSJ 2871dtype: int64
df.columns
Index(['ORDER_ID', 'ORDER_PRIMARY_ID', 'CODE', 'PRE_IN_TIME', 'PRE_OUT_TIME', 'ORDER_TIME', 'STATUS', 'CANCEL_TIME', 'INSERT_TIME', 'MODIFY_TIME', 'FIRM', 'GUEST_ID', 'BDATE', 'XZQH', 'IN_TIME', 'OUT_TIME', 'in_hour', 'in_minute', 'in_time', 'out_hour', 'out_minute', 'out_time', 'in_datetime', 'out_datetime', 'HOTELID', 'JYMJ', 'ROOM_NUM', 'BED_NUM', 'CZLY', 'CALLED', 'BUR_CODE', 'STA_CODE', 'SSX', 'ADDRESS', 'MPHM', 'DJSJ', 'BGSJ'], dtype='object')
删除HOTELID,CANCEL_TIME
df1 = df[['ORDER_ID', 'CODE', 'PRE_IN_TIME', 'PRE_OUT_TIME', 'ORDER_TIME', 'INSERT_TIME', 'MODIFY_TIME', 'IN_TIME', 'OUT_TIME', 'JYMJ', 'ROOM_NUM', 'BED_NUM', 'STATUS']]
0-撤销;1-预定;2-入住;3-离开
df1['STATUS'].value_counts()
1 344612 49213 661Name: STATUS, dtype: int64
df1.head(50)
| ORDER_ID | CODE | PRE_IN_TIME | PRE_OUT_TIME | ORDER_TIME | INSERT_TIME | MODIFY_TIME | IN_TIME | OUT_TIME | JYMJ | ROOM_NUM | BED_NUM | STATUS |
---|
0 | 923521 | 278337 | 202008011200 | 202008021200 | 159612011149.0 | 20200730224152 | 20200730224152 | 0.0 | 0.0 | 0.0 | 1 | 2 | 1 |
---|
1 | 761299 | 278337 | 202007221200 | 202007251200 | 202007150017.0 | 20200715001752 | 20200715001752 | 0.0 | 0.0 | 0.0 | 1 | 2 | 1 |
---|
2 | 943703 | 278337 | 202008091200 | 202008121200 | 202008012010.0 | 20200801201019 | 20200801201019 | 0.0 | 0.0 | 0.0 | 1 | 2 | 1 |
---|
3 | 1012389 | 278337 | 202008141200 | 202008171200 | 202008080900.0 | 20200808090100 | 20200808090100 | 0.0 | 0.0 | 0.0 | 1 | 2 | 1 |
---|
4 | 1351873 | 278337 | 202010051200 | 202010061200 | 202009091245.0 | 20200909124547 | 20200909124547 | 0.0 | 0.0 | 0.0 | 1 | 2 | 1 |
---|
5 | 1481730 | 278337 | 202010051200 | 202010071200 | 202009221039.0 | 20200922103911 | 20200922103911 | 202010051624 | 202010071200 | 0.0 | 1 | 2 | 3 |
---|
6 | 1516263 | 278337 | 202010011200 | 202010041200 | 202009251502.0 | 20200925150231 | 20200925150231 | 0.0 | 0.0 | 0.0 | 1 | 2 | 1 |
---|
7 | 1539785 | 278337 | 202010041200 | 202010051200 | 202009271942.0 | 20200927194258 | 20200927194258 | 0.0 | 0.0 | 0.0 | 1 | 2 | 1 |
---|
8 | 1568043 | 278337 | 202010041200 | 202010051200 | 202009301837.0 | 20200930183801 | 20200930183801 | 0.0 | 0.0 | 0.0 | 1 | 2 | 1 |
---|
9 | 1830343 | 278337 | 202010311200 | 202011011200 | 202010301016.0 | 20201031120256 | 20201031120256 | 0.0 | 0.0 | 0.0 | 1 | 2 | 1 |
---|
10 | 1830343 | 278337 | 202010311200 | 202011011200 | 202010301016.0 | 20201030101655 | 20201030101655 | 0.0 | 0.0 | 0.0 | 1 | 2 | 1 |
---|
11 | 1836522 | 278337 | 202010301200 | 202010311200 | 202010301933.0 | 20201030193317 | 20201030193317 | 0.0 | 0.0 | 0.0 | 1 | 2 | 1 |
---|
12 | 1850449 | 278337 | 202011011200 | 202011021200 | 202011010951.0 | 20201101121128 | 20201101121128 | 0.0 | 0.0 | 0.0 | 1 | 2 | 1 |
---|
13 | 1850449 | 278337 | 202011011200 | 202011021200 | 202011010951.0 | 20201101095111 | 20201101095111 | 0.0 | 0.0 | 0.0 | 1 | 2 | 1 |
---|
14 | 2035663 | 278337 | 202011210022 | 202011211200 | 202011210022.0 | 20201121002216 | 20201121002216 | 0.0 | 0.0 | 0.0 | 1 | 2 | 1 |
---|
15 | 2363298 | 278337 | 202101021200 | 202101041200 | 202012230028.0 | 20210102120800 | 20210102120800 | 0.0 | 0.0 | 0.0 | 1 | 2 | 1 |
---|
16 | 2363298 | 278337 | 202101021200 | 202101041200 | 202012230028.0 | 20201223002859 | 20201223002859 | 0.0 | 0.0 | 0.0 | 1 | 2 | 1 |
---|
17 | 2465555 | 278337 | 202101021200 | 202101031200 | 202101011632.0 | 20210101163250 | 20210101163250 | 0.0 | 0.0 | 0.0 | 1 | 2 | 1 |
---|
18 | 2465555 | 278337 | 202101021200 | 202101031200 | 202101011632.0 | 20210102120701 | 20210102120701 | 0.0 | 0.0 | 0.0 | 1 | 2 | 1 |
---|
19 | 3356024 | 278337 | 202104291158 | 202105051158 | 202104121231.0 | 20210429120002 | 20210429120002 | 0.0 | 0.0 | 0.0 | 1 | 2 | 1 |
---|
20 | 3576969 | 278337 | 202105061158 | 202105091158 | 202104291925.0 | 20210506120002 | 20210506120002 | 0.0 | 0.0 | 0.0 | 1 | 2 | 1 |
---|
21 | 3599404 | 278337 | 202105051158 | 202105061158 | 202105020724.0 | 20210505120008 | 20210505120008 | 0.0 | 0.0 | 0.0 | 1 | 2 | 1 |
---|
22 | 3599404 | 278337 | 202105051158 | 202105061158 | 202105020724.0 | 20210502072638 | 20210502072638 | 0.0 | 0.0 | 0.0 | 1 | 2 | 1 |
---|
23 | 3665000 | 278337 | 202106121158 | 202106151158 | 202105081926.0 | 20210508192825 | 20210508192825 | 0.0 | 0.0 | 0.0 | 1 | 2 | 1 |
---|
24 | 3686275 | 278337 | 202105102004 | 202105111158 | 202105102004.0 | 20210510201002 | 20210510201002 | 0.0 | 0.0 | 0.0 | 1 | 2 | 1 |
---|
25 | 3750952 | 278337 | 202105161158 | 202105171158 | 202105161150.0 | 20210516120228 | 20210516120228 | 0.0 | 0.0 | 0.0 | 1 | 2 | 1 |
---|
26 | 3777571 | 278337 | 202105181904 | 202105191158 | 202105181904.0 | 20210518190621 | 20210518190621 | 0.0 | 0.0 | 0.0 | 1 | 2 | 1 |
---|
27 | 3787561 | 278337 | 202105211158 | 202105231158 | 202105191621.0 | 20210519162332 | 20210519162332 | 0.0 | 0.0 | 0.0 | 1 | 2 | 1 |
---|
28 | 3787561 | 278337 | 202105211158 | 202105231158 | 202105191621.0 | 20210521120006 | 20210521120006 | 0.0 | 0.0 | 0.0 | 1 | 2 | 1 |
---|
29 | 3941122 | 278337 | 202106022128 | 202106031158 | 202106022128.0 | 20210602214001 | 20210602214001 | 0.0 | 0.0 | 0.0 | 1 | 2 | 1 |
---|
30 | 3941122 | 278337 | 202106022128 | 202106031158 | 202106022128.0 | 20210602213051 | 20210602213051 | 0.0 | 0.0 | 0.0 | 1 | 2 | 1 |
---|
31 | 3988599 | 278337 | 202106101158 | 202106111158 | 202106070140.0 | 20210610120016 | 20210610120016 | 0.0 | 0.0 | 0.0 | 1 | 2 | 1 |
---|
32 | 3988599 | 278337 | 202106101158 | 202106111158 | 202106070140.0 | 20210607014246 | 20210607014246 | 0.0 | 0.0 | 0.0 | 1 | 2 | 1 |
---|
33 | 4300717 | 278337 | 202107061158 | 202107071158 | 202107031110.0 | 20210706120003 | 20210706120003 | 0.0 | 0.0 | 0.0 | 1 | 2 | 1 |
---|
34 | 4308464 | 278337 | 202107051158 | 202107061158 | 202107032053.0 | 20210703205553 | 20210703205553 | 0.0 | 0.0 | 0.0 | 1 | 2 | 1 |
---|
35 | 4308464 | 278337 | 202107051158 | 202107061158 | 202107032053.0 | 20210705120003 | 20210705120003 | 0.0 | 0.0 | 0.0 | 1 | 2 | 1 |
---|
36 | 923696 | 282932 | 202008041400 | 202008061200 | 159612092325.0 | 20200730225524 | 20200730225524 | 0.0 | 0.0 | 0.0 | 2 | 3 | 1 |
---|
37 | 1065945 | 282932 | 202008131400 | 202008141200 | 202008122227.0 | 20200812222725 | 20200812222725 | 0.0 | 0.0 | 0.0 | 2 | 3 | 1 |
---|
38 | 1370082 | 282932 | 202009181400 | 202009191200 | 202009110933.0 | 20200911093350 | 20200911093350 | 0.0 | 0.0 | 0.0 | 2 | 3 | 1 |
---|
39 | 1370328 | 282932 | 202009191400 | 202009211200 | 202009111005.0 | 20200911100510 | 20200911100510 | 0.0 | 0.0 | 0.0 | 2 | 3 | 1 |
---|
40 | 1550691 | 282932 | 202010011400 | 202010021200 | 202009282156.0 | 20200928215621 | 20200928215621 | 0.0 | 0.0 | 0.0 | 2 | 3 | 1 |
---|
41 | 1612645 | 282932 | 202010081400 | 202010091200 | 202010061132.0 | 20201006113229 | 20201006113229 | 0.0 | 0.0 | 0.0 | 2 | 3 | 1 |
---|
42 | 4497012 | 282932 | 202107221358 | 202107291158 | 202107181945.0 | 20210718194727 | 20210718194727 | 0.0 | 0.0 | 0.0 | 2 | 3 | 1 |
---|
43 | 4505766 | 282932 | 202107201358 | 202107221158 | 202107191521.0 | 20210719152358 | 20210719152358 | 0.0 | 0.0 | 0.0 | 2 | 3 | 1 |
---|
44 | 4505766 | 282932 | 202107201358 | 202107221158 | 202107191521.0 | 20210720140022 | 20210720140022 | 0.0 | 0.0 | 0.0 | 2 | 3 | 1 |
---|
45 | 706648 | 227185 | 202007151300 | 202007161200 | 202007092241.0 | 20200714171021 | 20200714171021 | 0.0 | 0.0 | 0.0 | 1 | 1 | 1 |
---|
46 | 853709 | 227185 | 202007251300 | 202007261200 | 202007232243.0 | 20200723224348 | 20200723224348 | 0.0 | 0.0 | 0.0 | 1 | 1 | 1 |
---|
47 | 1008589 | 227185 | 202008251300 | 202008271200 | 202008072052.0 | 20200807205244 | 20200807205244 | 0.0 | 0.0 | 0.0 | 1 | 1 | 1 |
---|
48 | 1089505 | 227185 | 202008181300 | 202008201200 | 202008142213.0 | 20200814221323 | 20200814221323 | 0.0 | 0.0 | 0.0 | 1 | 1 | 1 |
---|
49 | 1111054 | 227185 | 202008171300 | 202008181200 | 202008162352.0 | 20200816235228 | 20200816235228 | 0.0 | 0.0 | 0.0 | 1 | 1 | 1 |
---|
df['CODE'].value_counts()[:10]
B106535897 420315790 278296093 222296537 187282520 177B105273690 142298705 124298707 122289326 116294473 116Name: CODE, dtype: int64
df['HOTELID'].value_counts()[:10]
37100A3821 420371098A808 27837100A9707 222371098B022 18737100A1898 17737100A3565 142371099A016 124371099A013 12237100A9501 11637100A8976 116Name: HOTELID, dtype: int64
df_g = df.groupby(['CODE', 'PRE_IN_TIME', 'PRE_OUT_TIME'])['ORDER_ID'].count().reset_index()
df_g.head()
| CODE | PRE_IN_TIME | PRE_OUT_TIME | ORDER_ID |
---|
0 | 100177 | 202107111358 | 202107121158 | 2 |
---|
1 | 100177 | 202107241358 | 202107261158 | 2 |
---|
2 | 100181 | 202106271358 | 202106291158 | 2 |
---|
3 | 100181 | 202107191358 | 202107201158 | 1 |
---|
4 | 100181 | 202107221358 | 202107231158 | 2 |
---|
ORDER_ID有重复,MODIFY_TIME修改过,按照最后一次的修改确定时间
df1[df1['CODE']=='100177']
| ORDER_ID | CODE | PRE_IN_TIME | PRE_OUT_TIME | ORDER_TIME | INSERT_TIME | MODIFY_TIME | IN_TIME | OUT_TIME | JYMJ | ROOM_NUM | BED_NUM | STATUS |
---|
36937 | 4402846 | 100177 | 202107111358 | 202107121158 | 202107111032.0 | 20210711103458 | 20210711103458 | 0.0 | 0.0 | 0.0 | 1 | 1 | 1 |
---|
36938 | 4402846 | 100177 | 202107111358 | 202107121158 | 202107111032.0 | 20210711140042 | 20210711140042 | 0.0 | 0.0 | 0.0 | 1 | 1 | 1 |
---|
36939 | 4455098 | 100177 | 202107241358 | 202107261158 | 202107151446.0 | 20210724140006 | 20210724140006 | 0.0 | 0.0 | 0.0 | 1 | 1 | 1 |
---|
36940 | 4455098 | 100177 | 202107241358 | 202107261158 | 202107151446.0 | 20210715144850 | 20210715144850 | 0.0 | 0.0 | 0.0 | 1 | 1 | 1 |
---|
df1.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 40043 entries, 0 to 40042
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ORDER_ID 40043 non-null int64
1 CODE 40043 non-null object
2 PRE_IN_TIME 40043 non-null object
3 PRE_OUT_TIME 40043 non-null object
4 ORDER_TIME 40043 non-null object
5 INSERT_TIME 40043 non-null object
6 MODIFY_TIME 40043 non-null object
7 IN_TIME 40043 non-null object
8 OUT_TIME 40043 non-null object
9 JYMJ 40043 non-null float64
10 ROOM_NUM 40043 non-null int64
11 BED_NUM 40043 non-null int64
12 STATUS 40043 non-null int64
dtypes: float64(1), int64(4), object(8)
memory usage: 4.3+ MB
|