1 创建读取数据
# Load the raw crime records (the path is specific to the author's machine).
df = pd.read_csv(r"F:\gongsi\WorkList\1工作文档\时空聚类数据集\洛杉矶犯罪数据集\Crime_Data_from_2020_to_Present.csv")
# Keep rows with a positive latitude; NaN and zero latitudes fail the comparison.
df_quNAN = df[df['LAT'] > 0]
# "DATE OCC" is matched as text: keep rows containing "/2020" ...
date_y2020 = df_quNAN[df_quNAN["DATE OCC"].str.contains("/2020")]
# ... and, of those, rows whose text starts with "01"
# (presumably the month in an MM/DD/YYYY layout -- confirm against the CSV).
date_y2020month = date_y2020[date_y2020["DATE OCC"].str.startswith('01')]
# Take an independent copy of the two coordinate columns so that columns added
# later are written to this frame and not to a slice of `df`.
kongjian_y2020month = date_y2020month.loc[:, ['LAT', 'LON']].copy()
# The cluster-label column is NOT added here: no clustering has been fitted at
# this point, and a third column would make the two-name rename below fail.
kongjian_y2020month.columns = ['latitude', 'longitude']
# Replace the inherited row labels with a fresh 0..n-1 index.
kongjian_y2020month.index = range(len(kongjian_y2020month))
2 查看数据
(1)查看列 date_y2020.columns (2)date_y2020.info() (3)df.describe() (4)df.head(10) (5)df.shape (6)df['LAT'].dtypes # 查看某一列的类型 (7)date_y2020month['Crm Cd'].value_counts().keys() (8)查看值分布
# Attach the fitted cluster labels, then look at how many rows each label has.
kongjian_y2020month['cluster'] = cluster.labels_
cluster_sizes = kongjian_y2020month['cluster'].value_counts()
# Counts only (as an array), most frequent label first.
cluster_sizes.values
# Label -> count table.
cluster_sizes
(9)查看每个值的分布
# One scatter plot per distinct 'Premis Cd' value, in value_counts() order.
for item in date_y2020month['Premis Cd'].value_counts().keys():
    print(item)
    plt.figure(figsize=(12, 9), dpi=80)
    # Rows whose premise code equals the current value.
    premise_rows = date_y2020month[date_y2020month['Premis Cd'] == item]
    lon_values = premise_rows['LON'].values.tolist()
    lat_values = premise_rows['LAT'].values.tolist()
    plt.scatter(lon_values, lat_values)
    plt.show()
(10)查看以6开头的值的分布
# Descriptions of the rows where int('Crm Cd' / 100) equals 6.
crm_hundreds = (date_y2020jan1['Crm Cd'] / 100).astype(int)
rows_with_six = date_y2020jan1[crm_hundreds == 6]
list(rows_with_six['Crm Cd Desc'].values)
(11)查看每种类型的值的个数 (12)查看最大最小值唯一值
# Smallest value in the "date_time" column.
earliest = df["date_time"].min()
print(earliest)
# Distinct values present in the "month" column.
df["month"].unique()
(13)查看空值分布
# Count missing values per column, then show only the columns that have any.
# (The original line had an unbalanced bracket: `sum([` instead of `sum()[`.)
null_counts = kongjian_y2020jan.isnull().sum()
null_counts[null_counts > 0]
(14)柱状图查看
(15)统计数据查看
3 获取指定行列数据
3.1 查询选择
https://blog.csdn.net/weixin_29696999/article/details/112663789 pandas读取行列数据的所有方法 https://www.cnblogs.com/wynlfd/p/14024947.html
一共3种: 用loc和iloc可以解决所有问题了 (1) 获取指定行名和列名用loc (2) 获取指定列的所有行用df[['省份', '总人数']],不推荐用它;df['省份'] # 按列名取列;df.省份 # 按列名取列 (3) 根据位置索引用iloc
3.2 根据条件筛选
https://www.pythonf.cn/read/157986
Pandas提取含有指定字符串的行或列 https://blog.csdn.net/nixiang_888/article/details/109371043?utm_medium=distribute.pc_relevant.none-task-blog-2%7Edefault%7EBlogCommendFromMachineLearnPai2%7Edefault-1.control&depth_1-utm_source=distribute.pc_relevant.none-task-blog-2%7Edefault%7EBlogCommendFromMachineLearnPai2%7Edefault-1.control
3.3 数据格式转换
# Divide the premise code by 100 and truncate to int (drops the last two digits).
premise_scaled = y2020jan['Premis Cd'] / 100
y2020jan['Premis Cd'] = premise_scaled.astype(int)
3.4 数据统计
# Bar chart of row counts per 'AREA' for the rows with cluster_num == 3.
cluster_three = y2020jan1[y2020jan1['cluster_num'] == 3]
area_sizes = cluster_three.groupby('AREA').size()
x_label = area_sizes.keys().values  # the AREA values
y_label = area_sizes.values         # rows per AREA
plt.bar(x_label, y_label)
plt.show()
4 时间数据
这个最好 https://blog.csdn.net/weixin_41261833/article/details/104839119
https://www.cnblogs.com/shao-shuai/p/9839011.html
https://www.cnblogs.com/zhangyafei/p/10513893.html
https://blog.csdn.net/weixin_39637285/article/details/111534198?utm_medium=distribute.pc_relevant.none-task-blog-2defaultbaidujs_baidulandingword~default-4.control&spm=1001.2101.3001.4242
4.1 提取年月日时间
# Parse the date-only text column and print how often each date occurs.
df["date_"] = pd.to_datetime(df['date'])
print(df["date_"].value_counts().values)
# Join the 'date' and 'time' text columns and parse them as one timestamp.
df["date_time"] = pd.to_datetime(df['date'] + ' ' + df['time'])
# Shift every timestamp by +8 hours. pd.Timedelta is used so the snippet needs
# only pandas. NOTE(review): the column name suggests a UTC -> Beijing
# conversion -- confirm the source timezone.
df["date_time_beijing"] = df["date_time"] + pd.Timedelta(hours=8)
# Vectorised component extraction through the .dt accessor (no per-row lambda).
beijing_parts = df["date_time_beijing"].dt
df["year"] = beijing_parts.year
df["month"] = beijing_parts.month
df["day"] = beijing_parts.day
print("最小时间:", df["date_time_beijing"].min())
print(df["date_time_beijing"].max())
df["weekday"] = beijing_parts.dayofweek  # Monday == 0
df["hour"] = beijing_parts.hour
# Keep every 12th row (rows 0, 12, 24, ...).
df_min = df.iloc[::12, :]
# Separate example: use the "Date" column as the index of another frame.
apple = apple.set_index("Date")
4.2 提取时间time,不要date
# Keep only the time-of-day part of each timestamp (datetime.time objects).
# The original lambda returned `x.time` without calling it, which stores the
# bound method instead of the time value.
df["time"] = df['date_time_beijing'].dt.time
字符串和datetime之间的转换https://blog.csdn.net/xiaomeng29/article/details/91366762
5 数据操作(加减乘除)
归一化
def max_min_scaler(x):
    """Rescale *x* linearly so its minimum maps to 0 and its maximum to 1.

    :param x: array-like supporting ``.min()``, ``.max()`` and element-wise
        arithmetic (e.g. a pandas Series or a NumPy array).
    :return: ``(x - min) / (max - min)``, same shape as *x*.

    If all values are equal the denominator is zero, so the result is not
    finite (NaN for float data).
    """
    lowest = x.min()
    highest = x.max()
    return (x - lowest) / (highest - lowest)
# Apply the min-max scaler column-wise to a one-column frame of 'Vict Age'.
age_frame = y2020jan[['Vict Age']]
kkk = age_frame.apply(max_min_scaler)
类型转换
# Cast 'Crm Cd' to str and 'Premis Cd' to int, in that order.
for column_name, target_type in (('Crm Cd', str), ('Premis Cd', int)):
    y2020jan[column_name] = y2020jan[column_name].astype(target_type)
将一列数对应改成其它名称,先变成字典,再一一对应
# Distinct 'Vict Descent' values, in order of first appearance.
y2020jan_result = y2020jan['Vict Descent'].unique().tolist()
# Integer codes 0..k-1, one per distinct value.
list_vict = list(range(len(y2020jan_result)))
# value -> code lookup; the code is the value's position in y2020jan_result.
Vict_Descent_dict = {label: code for code, label in enumerate(y2020jan_result)}
Vict_Descent_dict
# Replace every value in the column by its integer code.
y2020jan['Vict Descent'] = y2020jan['Vict Descent'].map(Vict_Descent_dict)
除以100只保留百位数字
# Divide each code column by 100 and truncate to int (drops the last two digits).
for code_column in ('Crm Cd', 'Premis Cd', 'Weapon Used Cd'):
    y2020jan[code_column] = (y2020jan[code_column] / 100).astype(int)
y2020jan
6 评价指标
# Score the clustering on the same (LAT, LON) column order that the custom
# metric receives when DBSCAN is fitted.
coords = kongjian_y2020month[['LAT', 'LON']]
# calinski_harabasz_score takes only (X, labels); it has no `metric` parameter,
# so passing one raises TypeError. It is computed on the raw coordinates.
print("calinski_harabasz_score:", calinski_harabasz_score(coords, cluster.labels_))
# silhouette_score does accept a callable metric.
print("silhouette_score:", silhouette_score(coords, cluster.labels_, metric=customer_metric))
7 可视化探索
# Scatter of all points coloured by cluster label, saved to disk and shown.
# `item` is not defined in this snippet; it must come from the surrounding
# context (presumably a loop over min_samples values -- confirm).
plt.figure(figsize=(12, 9), dpi=80)
lon_values = kongjian_y2020month['LON'].values.tolist()
lat_values = kongjian_y2020month['LAT'].values.tolist()
plt.scatter(lon_values, lat_values, c=cluster.labels_)
plt.title('SPACE-DBSCAN: 2020-01:eps=500,min_samples=%d' % item)
plt.xlabel('longitude(W)')
plt.ylabel('latitude(N)')
output_path = "F:/gongsi/WorkList/6canshu/2dbscan_min/SpaceDbscan202001min%d.png" % item
plt.savefig(output_path, dpi=300)
plt.show()
# Clear the current axes after showing.
plt.cla()
进行图例的划分
# One scatter series per class frame, each with its own colour and legend entry.
fig, ax1 = plt.subplots(figsize=(12, 9), dpi=80)
class_series = (
    (y2020jan_k0, 'red', 'class1'),
    (y2020jan_k1, 'purple', 'class2'),
    (y2020jan_k2, 'blue', 'class3'),
    (y2020jan_k3, 'pink', 'class4'),
    (y2020jan_k4, 'green', 'class5'),
    (y2020jan_k5, 'yellow', 'class6'),
)
for class_frame, colour, legend_label in class_series:
    # Longitude on x and latitude on y, matching the other plots in these
    # notes (the original had the two axes swapped here).
    ax1.scatter(
        class_frame['LON'].values.tolist(),
        class_frame['LAT'].values.tolist(),
        c=colour,
        label=legend_label,
    )
ax1.set_xlabel('longitude(W)')
ax1.set_ylabel('latitude(N)')
plt.legend()
plt.show()
显示中文 https://blog.csdn.net/asialee_bird/article/details/81027488
from matplotlib.font_manager import FontProperties
# Load a font file that contains CJK glyphs so Chinese text renders
# (Windows-specific path).
font = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=15)
fig, ax1 = plt.subplots(figsize=(12, 9), dpi=80)
# Noise points first (no legend entry), then the clustered points on top.
noise_lon = zaoshengdian['LON'].values.tolist()
noise_lat = zaoshengdian['LAT'].values.tolist()
ax1.scatter(noise_lon, noise_lat, c='red')
hot_lon = redian['LON'].values.tolist()
hot_lat = redian['LAT'].values.tolist()
ax1.scatter(hot_lon, hot_lat, c='purple', label='高犯罪率地区')
# The font is passed explicitly wherever Chinese text is drawn.
plt.legend(prop=font)
plt.title('洛杉矶2020年1月份犯罪热点区域', fontproperties=font)
plt.xlabel('longitude(W)')
plt.ylabel('latitude(N)')
plt.show()
加载地图
import folium
# Map centre as (latitude, longitude); presumably central Los Angeles, given
# the dataset used in these notes -- confirm.
latitude, longitude = 34.0677, -118.2398
map_centre = [latitude, longitude]
san_map = folium.Map(location=map_centre, zoom_start=12)
from geopy.distance import great_circle
import pandas as pd
from sklearn.cluster import DBSCAN
# Load the raw crime records (the path is specific to the author's machine).
df = pd.read_csv(r"F:\gongsi\WorkList\1工作文档\时空聚类数据集\洛杉矶犯罪数据集\Crime_Data_from_2020_to_Present.csv")
# Keep rows with a positive latitude; NaN and zero latitudes fail the comparison.
df_quNAN = df[df['LAT'] > 0]
# Text filters on "DATE OCC": contains "/2020", then starts with "12"
# (presumably the month in an MM/DD/YYYY layout -- confirm against the CSV).
date_y2020 = df_quNAN[df_quNAN["DATE OCC"].str.contains("/2020")]
date_y2020month = date_y2020[date_y2020["DATE OCC"].str.startswith('12')]
# Independent copy of the coordinate columns: the cluster column assigned later
# then lands on this frame instead of triggering SettingWithCopyWarning.
kongjian_y2020month = date_y2020month.loc[:, ['LAT', 'LON']].copy()
def customer_metric(x, y):
    """Return the great-circle distance between two points, in meters.

    Used as the ``metric`` callable for DBSCAN, which is fitted on the
    ['LAT', 'LON'] columns, so each argument is one (lat, lon) pair.

    :param x: first point
    :param y: second point
    :return: distance between the two points in meters
    """
    separation = great_circle(x, y)
    return separation.meters
# DBSCAN with the custom distance: eps is compared against customer_metric's
# return value (meters), so eps=500 means 500 m.
dbscan = DBSCAN(eps=500, min_samples=60, metric=customer_metric)
cluster = dbscan.fit(kongjian_y2020month[['LAT', 'LON']])
kongjian_y2020month['cluster'] = cluster.labels_
# Split on label -1 (DBSCAN's noise label): clustered points vs. noise points.
noise_mask = kongjian_y2020month['cluster'] == -1
redian = kongjian_y2020month[~noise_mask]
zaoshengdian = kongjian_y2020month[noise_mask]
# First two columns by position (LAT, LON) of the clustered points.
data = redian.iloc[:, :2]
# Collect one circle marker per clustered point in a single feature group.
incidents = folium.map.FeatureGroup()
for lat, lng in zip(data.LAT, data.LON):
    marker = folium.CircleMarker(
        [lat, lng],
        radius=4,
        color='yellow',
        fill=True,
        fill_color='red',
        fill_opacity=0.4,
    )
    incidents.add_child(marker)
# Build a fresh map and attach the markers to it.
san_map = folium.Map(location=[latitude, longitude], zoom_start=12)
san_map.add_child(incidents)
8 模型构建
9 其它
9.1 自定义距离
def customer_metric(x, y):
    """Return the great-circle distance between two points, in meters.

    :param x: first point
    :param y: second point
    :return: distance between the two points in meters
    """
    separation = great_circle(x, y)
    return separation.meters


# Plug the custom distance into DBSCAN; eps is compared against the metric's
# return value (meters).
dbscan = DBSCAN(
    eps=500,
    min_samples=30,
    metric=customer_metric,
)
探索距离
from scipy.spatial.distance import pdist
import numpy as np

# Four sample rows, each treated by pdist as one 4-dimensional observation.
x = np.array([[0, 2, 3, 4],
              [2, 0, 7, 8],
              [3, 7, 0, 12],
              [4, 8, 12, 0]])
# Condensed vector of pairwise Euclidean distances between the rows.
# (The original used typographic quotes around 'euclidean': a SyntaxError.)
distance = pdist(x, metric='euclidean')
# Quartiles of the pairwise distances.
low, mid, up = np.percentile(distance, [25, 50, 75])
# Candidate eps: median plus 1.5 times the inter-quartile range.
eps = mid + 1.5 * (up - low)
eps
9.2 设置参数
案例:
from geopy.distance import great_circle
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn import datasets
import numpy as np
import random
import matplotlib.pyplot as plt
import time
from sklearn.metrics import calinski_harabasz_score,silhouette_score
# Wall-clock start time of the run.
begin = time.time()
# Load the raw crime records (the path is specific to the author's machine).
df = pd.read_csv(r"F:\gongsi\WorkList\1工作文档\时空聚类数据集\洛杉矶犯罪数据集\Crime_Data_from_2020_to_Present.csv")
# Keep rows with a positive latitude; NaN and zero latitudes fail the comparison.
df_quNAN = df[df['LAT'] > 0]
# Text filters on "DATE OCC": contains "/2020", then starts with "01"
# (presumably the month in an MM/DD/YYYY layout -- confirm against the CSV).
date_y2020 = df_quNAN[df_quNAN["DATE OCC"].str.contains("/2020")]
date_y2020month = date_y2020[date_y2020["DATE OCC"].str.startswith('01')]
# Independent copy of the coordinate columns: the cluster column assigned later
# then lands on this frame instead of triggering SettingWithCopyWarning.
kongjian_y2020month = date_y2020month.loc[:, ['LAT', 'LON']].copy()
def customer_metric(x, y):
    """Return the great-circle distance between two points, in meters.

    Used as the ``metric`` callable for DBSCAN, which is fitted on the
    ['LAT', 'LON'] columns, so each argument is one (lat, lon) pair.

    :param x: first point
    :param y: second point
    :return: distance between the two points in meters
    """
    separation = great_circle(x, y)
    return separation.meters
# DBSCAN with the custom distance: eps is compared against customer_metric's
# return value (meters), so eps=500 means 500 m.
dbscan = DBSCAN(eps=500, min_samples=60, metric=customer_metric)
cluster = dbscan.fit(kongjian_y2020month[['LAT', 'LON']])
kongjian_y2020month['cluster'] = cluster.labels_
# Summary statistics of the coordinates and the label column.
kongjian_y2020month.describe()
# Split on label -1 (DBSCAN's noise label): clustered points vs. noise points.
noise_mask = kongjian_y2020month['cluster'] == -1
redian = kongjian_y2020month[~noise_mask]
zaoshengdian = kongjian_y2020month[noise_mask]
from matplotlib.font_manager import FontProperties
# Load a font file that contains CJK glyphs so Chinese text renders
# (Windows-specific path).
font = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=15)
fig, ax1 = plt.subplots(figsize=(12, 9), dpi=80)
# Noise points first (no legend entry), then the clustered points on top.
noise_lon = zaoshengdian['LON'].values.tolist()
noise_lat = zaoshengdian['LAT'].values.tolist()
ax1.scatter(noise_lon, noise_lat, c='purple')
hot_lon = redian['LON'].values.tolist()
hot_lat = redian['LAT'].values.tolist()
ax1.scatter(hot_lon, hot_lat, c='r', label='高犯罪率区域')
# The font is passed explicitly wherever Chinese text is drawn.
plt.legend(prop=font)
plt.title('洛杉矶2020年1月份犯罪热点区域', fontproperties=font)
plt.xlabel('longitude(W)')
plt.ylabel('latitude(N)')
plt.show()
|