%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st
import os
import re
import gc
import warnings
warnings.filterwarnings('ignore')
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus']=False
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 100)
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
path = 'traina/'
path1 = 'trainb/'
入住信息
cust_info = pd.read_csv(path + '网约房平台入住人表.csv')
cust_info1 = pd.read_csv(path1 + '网约房平台入住人表.csv')
cust = pd.concat([cust_info, cust_info1])
cust.head(2)
| ORDER_PRIMARY_ID | GUEST_ID | BDATE | XZQH | IN_TIME | OUT_TIME |
---|
0 | 07C5BF73B18B44B0877DEED007F8771D | NaN | 19800627 | 222405 | NaN | NaN |
---|
1 | 3525D57CAE104A078E4962B2B89377B0 | 371099C301202107090001 | 19951025 | 370523 | 2.021071e+11 | 2.021071e+11 |
---|
gc.collect()
89
cust['BDATE'] = cust['BDATE'].astype('str')
cust['XZQH'] = cust['XZQH'].astype('str')
cust['IN_TIME'] = cust['IN_TIME'].fillna(0).astype('str').apply(lambda x: x[:12])
cust['OUT_TIME'] = cust['OUT_TIME'].fillna(0).astype('str').apply(lambda x: x[:12])
cust.head(2)
| ORDER_PRIMARY_ID | GUEST_ID | BDATE | XZQH | IN_TIME | OUT_TIME |
---|
0 | 07C5BF73B18B44B0877DEED007F8771D | NaN | 19800627 | 222405 | 0.0 | 0.0 |
---|
1 | 3525D57CAE104A078E4962B2B89377B0 | 371099C301202107090001 | 19951025 | 370523 | 202107092103 | 202107111158 |
---|
cust.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 44030 entries, 0 to 3343
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ORDER_PRIMARY_ID 44030 non-null object
1 GUEST_ID 5158 non-null object
2 BDATE 44030 non-null object
3 XZQH 44030 non-null object
4 IN_TIME 44030 non-null object
5 OUT_TIME 44030 non-null object
dtypes: object(6)
memory usage: 2.4+ MB
cust.nunique()
ORDER_PRIMARY_ID 43883
GUEST_ID 3698
BDATE 9503
XZQH 2659
IN_TIME 3396
OUT_TIME 236
dtype: int64
cust['ORDER_PRIMARY_ID'].value_counts()[5:10]
03158D5CA62E42339D697612EA347FB3 2
0A99D74F1E0248D68A729CB79FC640E6 2
1FE3E448558347D89C56B0AFCEC8ACFB 2
864E417B7BDE4C7B9449B7E087E9F21E 2
3DEFA75566934CA5929D0C47F0B95FDE 2
Name: ORDER_PRIMARY_ID, dtype: int64
cust.sort_values(['ORDER_PRIMARY_ID', 'GUEST_ID'], inplace=True)
stat = cust.groupby(['ORDER_PRIMARY_ID'])['BDATE'].count().reset_index()
stat.columns = ['ORDER_PRIMARY_ID', 'guest_sum']
stat['guest_sum_notnull'] = cust.groupby(['ORDER_PRIMARY_ID'])['GUEST_ID'].agg('count').values
cust = cust.merge(stat, on=['ORDER_PRIMARY_ID'], how='left')
cust.drop_duplicates('ORDER_PRIMARY_ID', keep='first', inplace=True)
cust[cust['ORDER_PRIMARY_ID']=='70798782D6C04A438360D80AFE4845C1']
| ORDER_PRIMARY_ID | GUEST_ID | BDATE | XZQH | IN_TIME | OUT_TIME | guest_sum | guest_sum_notnull |
---|
22801 | 70798782D6C04A438360D80AFE4845C1 | 371099B389202107040001 | 19990314 | 130983 | 0.0 | 0.0 | 3 | 2 |
---|
cust['IN_TIME'].max(), cust['IN_TIME'].min()
('202110041413', '0.0')
订单信息
order_info = pd.read_csv(path + '网约平台旅客订单信息.csv')
order1 = pd.read_csv(path1 + '网约平台旅客订单信息.csv')
order = pd.concat([order_info, order1])
order.head(2)
| ORDER_ID | ORDER_PRIMARY_ID | HOTELID | PRE_IN_TIME | PRE_OUT_TIME | ORDER_TIME | STATUS | CANCEL_TIME | INSERT_TIME | MODIFY_TIME | FIRM |
---|
0 | 923521 | 96BBDB7CC049421C85826AE07020B139 | 278337 | 202008011200 | 202008021200 | 1.596120e+11 | 1 | NaN | 20200730224152 | 20200730224152 | 3 |
---|
1 | 923696 | C72F20539AD1447D86CD1A8E5EAEC63A | 282932 | 202008041400 | 202008061200 | 1.596121e+11 | 1 | NaN | 20200730225524 | 20200730225524 | 3 |
---|
order[order['ORDER_PRIMARY_ID']=='3CDEDB5E03534D379687645675898CA4']
| ORDER_ID | ORDER_PRIMARY_ID | HOTELID | PRE_IN_TIME | PRE_OUT_TIME | ORDER_TIME | STATUS | CANCEL_TIME | INSERT_TIME | MODIFY_TIME | FIRM |
---|
30970 | 4422027 | 3CDEDB5E03534D379687645675898CA4 | 170898 | 202108081358 | 202108101158 | 2.021071e+11 | 3 | NaN | 20210712210153 | 20210712210153 | 3 |
---|
order['PRE_IN_TIME'] = order['PRE_IN_TIME'].astype('str')
order['PRE_OUT_TIME'] = order['PRE_OUT_TIME'].astype('str')
order['INSERT_TIME'] = order['INSERT_TIME'].astype('str')
order['MODIFY_TIME'] = order['MODIFY_TIME'].astype('str')
order['CANCEL_TIME'] = order['CANCEL_TIME'].astype('str')
order['ORDER_TIME'] = order['ORDER_TIME'].fillna(0).astype('str').apply(lambda x: x[:12])
##########################没有考虑包含取消时间的订单
order = order[(order['CANCEL_TIME']=='nan')]
order = order.sort_values(['ORDER_ID', 'MODIFY_TIME'])
order.drop_duplicates('ORDER_ID', keep='last', inplace=True)
order.shape
(29941, 11)
order[order['ORDER_ID']==4402846]
| ORDER_ID | ORDER_PRIMARY_ID | HOTELID | PRE_IN_TIME | PRE_OUT_TIME | ORDER_TIME | STATUS | CANCEL_TIME | INSERT_TIME | MODIFY_TIME | FIRM |
---|
30141 | 4402846 | BC0E8E3602434EA6A5F29A6F6FF42233 | 100177 | 202107111358 | 202107121158 | 202107111032 | 1 | nan | 20210711140042 | 20210711140042 | 3 |
---|
order.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 29941 entries, 2 to 18590
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ORDER_ID 29941 non-null int64
1 ORDER_PRIMARY_ID 29941 non-null object
2 HOTELID 29941 non-null object
3 PRE_IN_TIME 29941 non-null object
4 PRE_OUT_TIME 29941 non-null object
5 ORDER_TIME 29941 non-null object
6 STATUS 29941 non-null int64
7 CANCEL_TIME 29941 non-null object
8 INSERT_TIME 29941 non-null object
9 MODIFY_TIME 29941 non-null object
10 FIRM 29941 non-null int64
dtypes: int64(3), object(8)
memory usage: 2.7+ MB
order.nunique()
ORDER_ID 29941
ORDER_PRIMARY_ID 29941
HOTELID 4801
PRE_IN_TIME 6875
PRE_OUT_TIME 729
ORDER_TIME 26818
STATUS 3
CANCEL_TIME 1
INSERT_TIME 25432
MODIFY_TIME 25432
FIRM 2
dtype: int64
order_info['FIRM'].value_counts()
3 32029
10 8521
Name: FIRM, dtype: int64
order['STATUS'].value_counts()
1 26137
2 3221
3 583
Name: STATUS, dtype: int64
order[order['INSERT_TIME']!=order['MODIFY_TIME']]
| ORDER_ID | ORDER_PRIMARY_ID | HOTELID | PRE_IN_TIME | PRE_OUT_TIME | ORDER_TIME | STATUS | CANCEL_TIME | INSERT_TIME | MODIFY_TIME | FIRM |
---|
合并数据order和cust
df = pd.merge(order, cust, on='ORDER_PRIMARY_ID')
月份分为1、3、5、7、8、10、12:31天;2:28天;4、6、9、11:30天
df['IN_TIME'] = df['IN_TIME'].apply(lambda x: np.nan if x=='0.0' else x)
df['OUT_TIME'] = df['OUT_TIME'].apply(lambda x: np.nan if x=='0.0' else x)
df['IN_TIME'].fillna(df['PRE_IN_TIME'], inplace=True)
df['OUT_TIME'].fillna(df['PRE_OUT_TIME'], inplace=True)
df['OUT_TIME'] = df.apply(lambda x: x['OUT_TIME'] if x['OUT_TIME'] >= x['PRE_OUT_TIME'] else x['PRE_OUT_TIME'], axis=1)
df['IN_TIME'] = df.apply(lambda x: x['IN_TIME'] if x['IN_TIME'] >= x['PRE_IN_TIME'] else x['PRE_IN_TIME'], axis=1)
df.drop(['PRE_IN_TIME', 'PRE_OUT_TIME', 'CANCEL_TIME'], axis=1, inplace=True)
df.head(2)
| ORDER_ID | ORDER_PRIMARY_ID | HOTELID | ORDER_TIME | STATUS | INSERT_TIME | MODIFY_TIME | FIRM | GUEST_ID | BDATE | XZQH | IN_TIME | OUT_TIME | guest_sum | guest_sum_notnull |
---|
0 | 706648 | A4AACE06B518418C8A8CA1935DDD1C5A | 227185 | 202007092241 | 1 | 20200714171021 | 20200714171021 | 3 | NaN | 20010409 | 371002 | 202007151300 | 202007161200 | 1 | 0 |
---|
1 | 748647 | 67D551AACFD049AE9CC2AB65E9870678 | 9483 | 202007132109 | 1 | 20201002101040 | 20201002101040 | 3 | NaN | 19881023 | 341202 | 202010011400 | 202010021011 | 1 | 0 |
---|
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 29941 entries, 0 to 29940
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ORDER_ID 29941 non-null int64
1 ORDER_PRIMARY_ID 29941 non-null object
2 HOTELID 29941 non-null object
3 ORDER_TIME 29941 non-null object
4 STATUS 29941 non-null int64
5 INSERT_TIME 29941 non-null object
6 MODIFY_TIME 29941 non-null object
7 FIRM 29941 non-null int64
8 GUEST_ID 3561 non-null object
9 BDATE 29941 non-null object
10 XZQH 29941 non-null object
11 IN_TIME 29941 non-null object
12 OUT_TIME 29941 non-null object
13 guest_sum 29941 non-null int64
14 guest_sum_notnull 29941 non-null int64
dtypes: int64(5), object(10)
memory usage: 3.7+ MB
df.nunique();
df['in_time'] = pd.to_datetime(df['IN_TIME'], errors='coerce', format='%Y%m%d%H%M')
df['out_time'] = pd.to_datetime(df['OUT_TIME'], errors='coerce', format='%Y%m%d%H%M')
df = df[~df['out_time'].isnull()]
df.head(2)
| ORDER_ID | ORDER_PRIMARY_ID | HOTELID | ORDER_TIME | STATUS | INSERT_TIME | MODIFY_TIME | FIRM | GUEST_ID | BDATE | XZQH | IN_TIME | OUT_TIME | guest_sum | guest_sum_notnull | in_time | out_time |
---|
0 | 706648 | A4AACE06B518418C8A8CA1935DDD1C5A | 227185 | 202007092241 | 1 | 20200714171021 | 20200714171021 | 3 | NaN | 20010409 | 371002 | 202007151300 | 202007161200 | 1 | 0 | 2020-07-15 13:00:00 | 2020-07-16 12:00:00 |
---|
1 | 748647 | 67D551AACFD049AE9CC2AB65E9870678 | 9483 | 202007132109 | 1 | 20201002101040 | 20201002101040 | 3 | NaN | 19881023 | 341202 | 202010011400 | 202010021011 | 1 | 0 | 2020-10-01 14:00:00 | 2020-10-02 10:11:00 |
---|
#后移一天 df[‘lock_time’][0] + pd.Timedelta(days=1) #前移一天 df[‘lock_time’][0] + pd.Timedelta(days=-1) #df[‘in_time’] - timedelta(days=1) #将day_gap列为0的数据全部往后推迟一天;;;;;;;还有钟点房
dfs = []
for idx, group in df.groupby(['ORDER_PRIMARY_ID']):
stat1 = pd.DataFrame()
stat1['datetime'] = pd.date_range(start=group['IN_TIME'].values[0], end=group['OUT_TIME'].values[0], freq='D', normalize=False, closed=None)
stat1['ORDER_PRIMARY_ID'] = group['ORDER_PRIMARY_ID'].values[0]
dfs.append(stat1)
df_date = pd.concat(dfs).reset_index(drop=True)
df1 = df.merge(df_date, on=['ORDER_PRIMARY_ID'], how='left')
df1.shape
(60973, 18)
df1['in_date'] = df1['in_time'].dt.date
df1['out_date'] = df1['out_time'].dt.date
df1['date'] = df1['datetime'].dt.date
df2 = df1[(df1['out_date']!=df1['date']) | ((df1['out_date']==df1['date']) & (df1['in_date']==df1['out_date']))]
df2.shape
(49547, 21)
df2.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 49547 entries, 0 to 60971
Data columns (total 21 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ORDER_ID 49547 non-null int64
1 ORDER_PRIMARY_ID 49547 non-null object
2 HOTELID 49547 non-null object
3 ORDER_TIME 49547 non-null object
4 STATUS 49547 non-null int64
5 INSERT_TIME 49547 non-null object
6 MODIFY_TIME 49547 non-null object
7 FIRM 49547 non-null int64
8 GUEST_ID 5963 non-null object
9 BDATE 49547 non-null object
10 XZQH 49547 non-null object
11 IN_TIME 49547 non-null object
12 OUT_TIME 49547 non-null object
13 guest_sum 49547 non-null int64
14 guest_sum_notnull 49547 non-null int64
15 in_time 49547 non-null datetime64[ns]
16 out_time 49547 non-null datetime64[ns]
17 datetime 49547 non-null datetime64[ns]
18 in_date 49547 non-null object
19 out_date 49547 non-null object
20 date 49547 non-null object
dtypes: datetime64[ns](3), int64(5), object(13)
memory usage: 8.3+ MB
room_info = pd.read_csv(path + '网约房注册民宿.csv')
room_info.head(1)
| CODE | HOTELID | JYMJ | ROOM_NUM | BED_NUM | FWLY | CZLY | CALLED | CITY_CODE | BUR_CODE | STA_CODE | SSX | ADDRESS | MPHM | JYQK | FIRM | DJSJ | BGSJ | STATUS | AUDITSTATUS |
---|
0 | 100177 | 3710830002 | 0.0 | 1 | 1 | 2 | 1 | 【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽 | 371000000000 | NaN | NaN | 371083 | 乳山市长江路 银泰海景花园 55-301 | ROOM001 | 【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽 | 3 | 2020-05-18 10:33:55 | 2020-07-15 10:23:58 | NaN | NaN |
---|
room_info['STATUS'].value_counts()
1.0 2658
Name: STATUS, dtype: int64
room_info.drop(['HOTELID', 'FWLY', 'CITY_CODE', 'STATUS','AUDITSTATUS', 'JYQK'], axis=1, inplace=True)
room_info.rename(columns={'CODE':'HOTELID'}, inplace= True)
room_info['FIRM'].value_counts()
3 3429
10 1878
Name: FIRM, dtype: int64
room_info.nunique()
HOTELID 5307
JYMJ 173
ROOM_NUM 11
BED_NUM 17
CZLY 4
CALLED 4232
BUR_CODE 7
STA_CODE 54
SSX 5
ADDRESS 4850
MPHM 584
FIRM 2
DJSJ 3753
BGSJ 3294
dtype: int64
登记时间和变更时间无变化的只有两家
room_info['JYMJ'] = room_info['JYMJ'].apply(lambda x: np.nan if x==0 else x)
room_use_col = ['HOTELID', 'JYMJ', 'ROOM_NUM', 'BED_NUM', 'FIRM', 'STATUS']
freq_col = ['CALLED', 'ADDRESS', 'JYMJ', 'ROOM_NUM', 'BED_NUM', 'CZLY', 'BUR_CODE', 'STA_CODE', 'SSX', 'MPHM', 'FIRM']
for col in freq_col:
st = room_info[col].value_counts().reset_index()
st.columns = [col, col+'_freq']
room_info = room_info.merge(st, on=col, how='left')
room_info['room_ratio'] = room_info['JYMJ'] / room_info['ROOM_NUM']
room_info['bed_ratio'] = room_info['JYMJ'] / room_info['BED_NUM']
room_info['room_bed'] = room_info['BED_NUM'] / room_info['ROOM_NUM']
room_info['DJSJ'] = pd.to_datetime(room_info['DJSJ'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
room_info['BGSJ'] = pd.to_datetime(room_info['BGSJ'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
room_info['DJ_date'] = room_info['DJSJ'].dt.date
room_info['BG_date'] = room_info['BGSJ'].dt.date
room_info['DJ_gap'] = (room_info['BG_date'] - room_info['DJ_date']).dt.days
room_info['DJSJ'].min(), room_info['DJSJ'].max(), room_info['BGSJ'].min(), room_info['BGSJ'].max()
(Timestamp('2020-05-12 13:16:24'),
Timestamp('2021-10-19 16:05:41'),
Timestamp('2020-07-15 10:23:06'),
Timestamp('2021-10-19 16:09:15'))
plt.figure(figsize=(20, 8))
sns.countplot(room_info['JYMJ'])
<AxesSubplot:xlabel='JYMJ', ylabel='count'>
? [外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-ayRq9a5M-1646146611632)(output_79_1.png)] ?
经营面积缺失超过一半,3000多,考虑删除
合并room到df中
df3 = df2.merge(room_info, on=['HOTELID', 'FIRM'], how='left')
df3 = df3.sort_values(by=['HOTELID', 'date'])
in_num = df3.groupby(['HOTELID', 'date'])['ORDER_ID'].count().reset_index()
in_num.columns = ['HOTELID', 'date', 'in_hotel_num']
df4 = df3.merge(in_num, on=['HOTELID', 'date'], how='left')
df4.head(2)
| ORDER_ID | ORDER_PRIMARY_ID | HOTELID | ORDER_TIME | STATUS | INSERT_TIME | MODIFY_TIME | FIRM | GUEST_ID | BDATE | XZQH | IN_TIME | OUT_TIME | guest_sum | guest_sum_notnull | in_time | out_time | datetime | in_date | out_date | date | JYMJ | ROOM_NUM | BED_NUM | CZLY | CALLED | BUR_CODE | STA_CODE | SSX | ADDRESS | MPHM | DJSJ | BGSJ | CALLED_freq | ADDRESS_freq | JYMJ_freq | ROOM_NUM_freq | BED_NUM_freq | CZLY_freq | BUR_CODE_freq | STA_CODE_freq | SSX_freq | MPHM_freq | FIRM_freq | room_ratio | bed_ratio | room_bed | DJ_date | BG_date | DJ_gap | in_hotel_num |
---|
0 | 4402846 | BC0E8E3602434EA6A5F29A6F6FF42233 | 100177 | 202107111032 | 1 | 20210711140042 | 20210711140042 | 3 | NaN | 19731004 | 370602 | 202107111358 | 202107121158 | 1 | 0 | 2021-07-11 13:58:00 | 2021-07-12 11:58:00 | 2021-07-11 13:58:00 | 2021-07-11 | 2021-07-12 | 2021-07-11 | NaN | 1 | 1 | 1 | 【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽 | NaN | NaN | 371083 | 乳山市长江路 银泰海景花园 55-301 | ROOM001 | 2020-05-18 10:33:55 | 2020-07-15 10:23:58 | 1 | 1 | NaN | 3091 | 1502 | 4727 | NaN | NaN | 150 | 2565 | 3429 | NaN | NaN | 1.0 | 2020-05-18 | 2020-07-15 | 58.0 | 1 |
---|
1 | 4455098 | 01D7394D02B44376913536F6071151AD | 100177 | 202107151446 | 1 | 20210724140006 | 20210724140006 | 3 | NaN | 19941001 | 429006 | 202107241358 | 202107261158 | 1 | 0 | 2021-07-24 13:58:00 | 2021-07-26 11:58:00 | 2021-07-24 13:58:00 | 2021-07-24 | 2021-07-26 | 2021-07-24 | NaN | 1 | 1 | 1 | 【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽 | NaN | NaN | 371083 | 乳山市长江路 银泰海景花园 55-301 | ROOM001 | 2020-05-18 10:33:55 | 2020-07-15 10:23:58 | 1 | 1 | NaN | 3091 | 1502 | 4727 | NaN | NaN | 150 | 2565 | 3429 | NaN | NaN | 1.0 | 2020-05-18 | 2020-07-15 | 58.0 | 1 |
---|
df4.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 49547 entries, 0 to 49546
Data columns (total 51 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ORDER_ID 49547 non-null int64
1 ORDER_PRIMARY_ID 49547 non-null object
2 HOTELID 49547 non-null object
3 ORDER_TIME 49547 non-null object
4 STATUS 49547 non-null int64
5 INSERT_TIME 49547 non-null object
6 MODIFY_TIME 49547 non-null object
7 FIRM 49547 non-null int64
8 GUEST_ID 5963 non-null object
9 BDATE 49547 non-null object
10 XZQH 49547 non-null object
11 IN_TIME 49547 non-null object
12 OUT_TIME 49547 non-null object
13 guest_sum 49547 non-null int64
14 guest_sum_notnull 49547 non-null int64
15 in_time 49547 non-null datetime64[ns]
16 out_time 49547 non-null datetime64[ns]
17 datetime 49547 non-null datetime64[ns]
18 in_date 49547 non-null object
19 out_date 49547 non-null object
20 date 49547 non-null object
21 JYMJ 14248 non-null float64
22 ROOM_NUM 49547 non-null int64
23 BED_NUM 49547 non-null int64
24 CZLY 49547 non-null int64
25 CALLED 49547 non-null object
26 BUR_CODE 22916 non-null float64
27 STA_CODE 22916 non-null object
28 SSX 49547 non-null int64
29 ADDRESS 49547 non-null object
30 MPHM 49547 non-null object
31 DJSJ 49497 non-null datetime64[ns]
32 BGSJ 49547 non-null datetime64[ns]
33 CALLED_freq 49547 non-null int64
34 ADDRESS_freq 49547 non-null int64
35 JYMJ_freq 14248 non-null float64
36 ROOM_NUM_freq 49547 non-null int64
37 BED_NUM_freq 49547 non-null int64
38 CZLY_freq 49547 non-null int64
39 BUR_CODE_freq 22916 non-null float64
40 STA_CODE_freq 22916 non-null float64
41 SSX_freq 49547 non-null int64
42 MPHM_freq 49547 non-null int64
43 FIRM_freq 49547 non-null int64
44 room_ratio 14248 non-null float64
45 bed_ratio 14248 non-null float64
46 room_bed 49547 non-null float64
47 DJ_date 49497 non-null object
48 BG_date 49547 non-null object
49 DJ_gap 49497 non-null float64
50 in_hotel_num 49547 non-null int64
dtypes: datetime64[ns](5), float64(9), int64(18), object(19)
memory usage: 19.7+ MB
df4['date'].min(), df4['date'].max()
(datetime.date(2020, 7, 15), datetime.date(2021, 11, 2))
df4['in_year'] = df4['datetime'].dt.year.fillna(0).astype('int')
df4['in_month'] = df4['datetime'].dt.month.fillna(0).astype('int')
df4['in_day'] = df4['datetime'].dt.day.fillna(0).astype('int')
df4['in_quarter'] = df4['datetime'].dt.quarter.fillna(0).astype('int')
df4['in_dayofweek'] = df4['datetime'].dt.dayofweek.fillna(0).astype('int')
df4['in_dayofyear'] = df4['datetime'].dt.dayofyear.fillna(0).astype('int')
df4['in_is_wknd'] = df4['datetime'].dt.dayofweek // 5
df4[:2]
| ORDER_ID | ORDER_PRIMARY_ID | HOTELID | ORDER_TIME | STATUS | INSERT_TIME | MODIFY_TIME | FIRM | GUEST_ID | BDATE | XZQH | IN_TIME | OUT_TIME | guest_sum | guest_sum_notnull | in_time | out_time | datetime | in_date | out_date | date | JYMJ | ROOM_NUM | BED_NUM | CZLY | CALLED | BUR_CODE | STA_CODE | SSX | ADDRESS | MPHM | DJSJ | BGSJ | CALLED_freq | ADDRESS_freq | JYMJ_freq | ROOM_NUM_freq | BED_NUM_freq | CZLY_freq | BUR_CODE_freq | STA_CODE_freq | SSX_freq | MPHM_freq | FIRM_freq | room_ratio | bed_ratio | room_bed | DJ_date | BG_date | DJ_gap | in_hotel_num | in_year | in_month | in_day | in_quarter | in_dayofweek | in_dayofyear | record_is_wknd |
---|
0 | 4402846 | BC0E8E3602434EA6A5F29A6F6FF42233 | 100177 | 202107111032 | 1 | 20210711140042 | 20210711140042 | 3 | NaN | 19731004 | 370602 | 202107111358 | 202107121158 | 1 | 0 | 2021-07-11 13:58:00 | 2021-07-12 11:58:00 | 2021-07-11 13:58:00 | 2021-07-11 | 2021-07-12 | 2021-07-11 | NaN | 1 | 1 | 1 | 【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽 | NaN | NaN | 371083 | 乳山市长江路 银泰海景花园 55-301 | ROOM001 | 2020-05-18 10:33:55 | 2020-07-15 10:23:58 | 1 | 1 | NaN | 3091 | 1502 | 4727 | NaN | NaN | 150 | 2565 | 3429 | NaN | NaN | 1.0 | 2020-05-18 | 2020-07-15 | 58.0 | 1 | 2021 | 7 | 11 | 3 | 6 | 192 | 1 |
---|
1 | 4455098 | 01D7394D02B44376913536F6071151AD | 100177 | 202107151446 | 1 | 20210724140006 | 20210724140006 | 3 | NaN | 19941001 | 429006 | 202107241358 | 202107261158 | 1 | 0 | 2021-07-24 13:58:00 | 2021-07-26 11:58:00 | 2021-07-24 13:58:00 | 2021-07-24 | 2021-07-26 | 2021-07-24 | NaN | 1 | 1 | 1 | 【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽 | NaN | NaN | 371083 | 乳山市长江路 银泰海景花园 55-301 | ROOM001 | 2020-05-18 10:33:55 | 2020-07-15 10:23:58 | 1 | 1 | NaN | 3091 | 1502 | 4727 | NaN | NaN | 150 | 2565 | 3429 | NaN | NaN | 1.0 | 2020-05-18 | 2020-07-15 | 58.0 | 1 | 2021 | 7 | 24 | 3 | 5 | 205 | 1 |
---|
df4['date1'] = df4['date'].astype('str')
use_month = [5, 6, 7, 8, 9]
df_1 = df4[(df4['in_is_wknd']==1) & (df4['in_month'].isin(use_month))]
df_1.shape
(11583, 59)
df_1['in_month'].value_counts()
7 4391
5 2520
6 2192
8 1656
9 824
Name: in_month, dtype: int64
holiday = ['2021-05-01', '2021-05-02', '2021-05-03', '2021-05-04', '2021-05-05', '2021-06-12', '2021-06-13', '2021-06-14',
'2021-09-19', '2021-09-20', '2021-09-21']
df4[df4['date']=='2021-05-03']
| ORDER_ID | ORDER_PRIMARY_ID | HOTELID | ORDER_TIME | STATUS | INSERT_TIME | MODIFY_TIME | FIRM | GUEST_ID | BDATE | XZQH | IN_TIME | OUT_TIME | guest_sum | guest_sum_notnull | in_time | out_time | datetime | in_date | out_date | date | JYMJ | ROOM_NUM | BED_NUM | CZLY | CALLED | BUR_CODE | STA_CODE | SSX | ADDRESS | MPHM | DJSJ | BGSJ | CALLED_freq | ADDRESS_freq | JYMJ_freq | ROOM_NUM_freq | BED_NUM_freq | CZLY_freq | BUR_CODE_freq | STA_CODE_freq | SSX_freq | MPHM_freq | FIRM_freq | room_ratio | bed_ratio | room_bed | DJ_date | BG_date | DJ_gap | in_hotel_num | in_year | in_month | in_day | in_quarter | in_dayofweek | in_dayofyear | record_is_wknd | in_is_wknd |
---|
df_2 = df4[df4['date1'].isin(holiday)]
df_2['holiday'] = 1
df_1['holiday'] = 0
df5 = pd.concat([df_2, df_1])
df5.shape
(14662, 61)
df5.drop(['CALLED', 'ADDRESS', 'INSERT_TIME', 'MODIFY_TIME', 'IN_TIME', 'OUT_TIME', 'guest_sum_notnull'], axis=1, inplace=True)
df5[:3]
| ORDER_ID | ORDER_PRIMARY_ID | HOTELID | ORDER_TIME | STATUS | FIRM | GUEST_ID | BDATE | XZQH | guest_sum | in_time | out_time | datetime | in_date | out_date | date | JYMJ | ROOM_NUM | BED_NUM | CZLY | BUR_CODE | STA_CODE | SSX | MPHM | DJSJ | BGSJ | CALLED_freq | ADDRESS_freq | JYMJ_freq | ROOM_NUM_freq | BED_NUM_freq | CZLY_freq | BUR_CODE_freq | STA_CODE_freq | SSX_freq | MPHM_freq | FIRM_freq | room_ratio | bed_ratio | room_bed | DJ_date | BG_date | DJ_gap | in_hotel_num | in_year | in_month | in_day | in_quarter | in_dayofweek | in_dayofyear | record_is_wknd | in_is_wknd | date1 | holiday |
---|
9 | 3597092 | CFD17F6BB142485DB3DEC6B5E2D0B664 | 100193 | 202105012046 | 1 | 3 | NaN | 19771008 | 132826 | 1 | 2021-05-01 20:46:00 | 2021-05-02 11:58:00 | 2021-05-01 20:46:00 | 2021-05-01 | 2021-05-02 | 2021-05-01 | NaN | 2 | 2 | 1 | NaN | NaN | 371083 | ROOM001 | 2020-05-18 10:33:55 | 2020-07-15 10:23:58 | 1 | 1 | NaN | 1249 | 2281 | 4727 | NaN | NaN | 150 | 2565 | 3429 | NaN | NaN | 1.0 | 2020-05-18 | 2020-07-15 | 58.0 | 1 | 2021 | 5 | 1 | 2 | 5 | 121 | 1 | 1 | 2021-05-01 | 1 |
---|
11 | 3601837 | 5C96E986C1F04691A3EB07962BCC7B00 | 100195 | 202105021435 | 1 | 3 | NaN | 19951009 | 370203 | 1 | 2021-05-02 14:35:00 | 2021-05-03 11:58:00 | 2021-05-02 14:35:00 | 2021-05-02 | 2021-05-03 | 2021-05-02 | NaN | 2 | 3 | 1 | NaN | NaN | 371083 | ROOM001 | 2020-05-18 10:33:55 | 2020-07-15 10:23:58 | 1 | 1 | NaN | 1249 | 895 | 4727 | NaN | NaN | 150 | 2565 | 3429 | NaN | NaN | 1.5 | 2020-05-18 | 2020-07-15 | 58.0 | 1 | 2021 | 5 | 2 | 2 | 6 | 122 | 1 | 1 | 2021-05-02 | 1 |
---|
53 | 3314117 | C0C5516462D3487C895DD654B836F855 | 10037 | 202104082130 | 1 | 3 | NaN | 19730313 | 120103 | 1 | 2021-04-30 13:58:00 | 2021-05-03 11:58:00 | 2021-05-01 13:58:00 | 2021-04-30 | 2021-05-03 | 2021-05-01 | NaN | 1 | 2 | 1 | NaN | NaN | 371002 | ROOM001 | 2020-05-18 10:22:13 | 2020-07-15 10:23:58 | 2 | 2 | NaN | 3091 | 2281 | 4727 | NaN | NaN | 4661 | 2565 | 3429 | NaN | NaN | 2.0 | 2020-05-18 | 2020-07-15 | 58.0 | 1 | 2021 | 5 | 1 | 2 | 5 | 121 | 1 | 1 | 2021-05-01 | 1 |
---|
drop_col = df5.columns.tolist()
drop_col.remove('holiday')
df5.drop_duplicates(drop_col, keep='first', inplace=True)
df5.shape
(12708, 54)
dd = df_1.groupby(['in_year', 'in_month'])['ORDER_ID'].count().reset_index()
dd
| in_year | in_month | ORDER_ID | ym |
---|
0 | 2020 | 7 | 60 | 202007 |
---|
1 | 2020 | 8 | 511 | 202008 |
---|
2 | 2020 | 9 | 680 | 202009 |
---|
3 | 2020 | 10 | 1487 | 202010 |
---|
4 | 2020 | 11 | 665 | 202011 |
---|
5 | 2020 | 12 | 904 | 202012 |
---|
6 | 2021 | 1 | 999 | 202101 |
---|
7 | 2021 | 2 | 71 | 202102 |
---|
8 | 2021 | 3 | 2 | 202103 |
---|
9 | 2021 | 4 | 4 | 202104 |
---|
10 | 2021 | 5 | 2520 | 202105 |
---|
11 | 2021 | 6 | 2192 | 202106 |
---|
12 | 2021 | 7 | 4331 | 202107 |
---|
13 | 2021 | 8 | 1145 | 202108 |
---|
14 | 2021 | 9 | 144 | 202109 |
---|
15 | 2021 | 10 | 40 | 202110 |
---|
dd['ym'] = dd['in_year']*100+dd['in_month']
dd['ym'].value_counts()
202106 1
202008 1
202110 1
202109 1
202012 1
202011 1
202010 1
202009 1
202007 1
202104 1
202102 1
202101 1
202103 1
202108 1
202107 1
202105 1
202111 1
Name: ym, dtype: int64
plt.figure(figsize=(15,8))
plt.scatter(dd.index, dd['ORDER_ID'])
<matplotlib.collections.PathCollection at 0x13562173a00>
? [外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-EHvuTP3g-1646146611636)(output_112_1.png)] ?
|