数据预处理

1.通过爬虫爬取链家的新房数据，并进行预处理。

1.1 分析过程

1.1.1 爬取

首先进行数据的爬取。在爬取过程中，出现并解决了以下问题：

问题1：部分数据仅含有总价而不含均价，且总价出现在均价的位置上

解决方案：判断均价位置的数据内是否含有“-”符号；若含有，则该数据为总价区间，将“-”之前的内容赋给总价，并将均价置空。

问题2：爬取数据不完整

解决方案：通过检查源网页，发现共有180条数据，但实际爬取到的数据少于这个数目。经检查发现，原因是源网页中存在一些数据项为空（例如没有房型信息）。对于这个问题，在提取时先判断数据项是否存在，对空值单独处理（赋值为空字符串）即可。

问题3：反爬机制封禁IP

解决方案：在爬取过程中，经常遇到同样的代码时而成功爬取，时而爬取结果不完整或失败的情况。对于这个问题，只需在问题出现时进入浏览器完成网页的人机验证，即可解封被封禁的IP，继续完成后续的爬取。

1.1.2 数据预处理

完成爬取后，需要进行数据的预处理：依次去除字符串首尾空格、设置数据类型，并求出总价和单价的最大值、最小值与中位数。

1.2 核心代码

1.2.1 spider.py

import scrapy
from test1.items import MyItem #从items.py中引入MyItem对象
class mySpider(scrapy.spiders.Spider):
    """Scrape new-home listings from the Lianjia Beijing site.

    Requests listing pages 1-22 and yields one ``MyItem`` per listing entry
    holding the name, three location parts, layout, minimum area, total
    price and unit price. Every field is stored as a string; a field that is
    missing on the page is stored as ''.
    """

    name = "lianjia"  # spider name
    # allowed_domains takes bare domain names (no scheme, path or trailing
    # slash). "lianjia.com" also covers the bj.fang.lianjia.com host that
    # start_urls point at.
    allowed_domains = ["lianjia.com"]
    start_urls = [
        "https://bj.fang.lianjia.com/loupan/pg{}/".format(pg)
        for pg in range(1, 23)
    ]

    @staticmethod
    def _first_text(node, query):
        """Return the first text matched by *query* under *node*, or ''.

        extract_first(default='') yields '' for a missing node instead of
        raising IndexError the way extract()[0] does, so one incomplete
        listing no longer aborts parsing of the whole page.
        """
        return node.xpath(query).extract_first(default='')

    @staticmethod
    def _min_area(text):
        """Return the part of *text* between the first space and the first '-'.

        When there is no '-', the slice ends at the first 'm' instead. If
        neither character occurs, the end index is -1, i.e. the last
        character is dropped (presumably a trailing unit sign -- TODO confirm
        against the live page).
        """
        start = text.find(" ") + 1
        end = text.find("-")
        if end == -1:
            end = text.find("m")
        return text[start:end]

    @staticmethod
    def _digits_before_dash(text):
        """Return all digit characters of *text* that precede the first '-'."""
        return ''.join(ch for ch in text.partition("-")[0] if ch.isdigit())

    def parse(self, response):
        """Extract one item per listing entry on a result page.

        Args:
            response: the downloaded listing page.

        Yields:
            MyItem: one populated item per entry whose name is non-empty.
        """
        for each in response.xpath("/html/body/div[3]/ul[2]/*"):
            # A fresh item per listing: re-yielding one shared instance would
            # let later iterations overwrite items that were already yielded.
            item = MyItem()

            # Fields are assigned in a fixed order (name ... unitPrice).
            item['name'] = self._first_text(each, "div/div[1]/a/text()")
            item['location1'] = self._first_text(each, "div/div[2]/span[1]/text()")
            item['location2'] = self._first_text(each, "div/div[2]/span[2]/text()")
            item['location3'] = self._first_text(each, "div/div[2]/a/text()")
            item['shape'] = self._first_text(each, "div/a/span[1]/text()")

            area_text = self._first_text(each, "div/div[3]/span/text()")
            item['area'] = self._min_area(area_text) if area_text else ''

            item['totalPrice'] = self._digits_before_dash(
                self._first_text(each, "div/div[6]/div[2]/text()")
            )

            # The unit-price slot sometimes holds a total-price range
            # ("low-high") instead. In that case the part before '-' becomes
            # the total price and the unit price is left empty.
            unit_text = self._first_text(each, "div/div[6]/div[1]/span[1]/text()")
            dash = unit_text.find("-")
            if dash != -1:
                item['totalPrice'] = unit_text[:dash]
                item['unitPrice'] = ''
            else:
                item['unitPrice'] = unit_text

            if item['name']:  # skip entries without a name
                yield item

1.2.2 items.py

import scrapy

class MyItem(scrapy.Item):
    """Fields of one scraped new-home listing."""
    # One scrapy.Field per attribute collected for a listing.
    name = scrapy.Field() #listing name
    location1 = scrapy.Field()#location, part 1 (a district in the sample output)
    location2 = scrapy.Field()#location, part 2 (a sub-area in the sample output)
    location3 = scrapy.Field()#location, part 3 (a street address in the sample output)
    shape = scrapy.Field()#layout (room count)
    area = scrapy.Field()#area
    totalPrice = scrapy.Field()#total price
    unitPrice = scrapy.Field()#unit price

1.2.3 pipelines.py

import csv
class MyPipeline:
    """Item pipeline that writes every scraped item as one row of lianjia.csv."""

    # Fixed column order of the output rows. It matches the column names that
    # lianjia_data_process.py passes to pandas.read_csv, so the row layout no
    # longer depends on the order in which the spider happened to set fields.
    FIELD_ORDER = ('name', 'location1', 'location2', 'location3',
                   'shape', 'area', 'totalPrice', 'unitPrice')

    def open_spider(self, spider):
        """Open lianjia.csv for writing (truncating it) and create the writer.

        The file is written as GBK explicitly because the processing script
        reads it with encoding='gbk'; relying on the platform default
        encoding breaks that round trip wherever the default is not GBK.

        Raises:
            OSError: if the file cannot be opened. The error is propagated on
                purpose: swallowing it only postpones the failure to an
                AttributeError on the first processed item.
        """
        self.file = open('lianjia.csv', 'w', newline='', encoding='gbk')
        self.csv = csv.writer(self.file)

    def process_item(self, item, spider):
        """Write *item* as one CSV row in FIELD_ORDER and return it unchanged.

        A field missing from the item is written as an empty string.
        """
        self.csv.writerow([item.get(field, '') for field in self.FIELD_ORDER])
        return item

    def close_spider(self, spider):
        """Close the output file if it was opened."""
        file = getattr(self, 'file', None)
        if file is not None:
            file.close()

1.2.4 lianjia_data_process.py

import pandas as pd

# Load the scraped CSV; column names are supplied explicitly and every
# column is read as a string.
COLUMN_NAMES = ['name', 'location1', 'location2', 'location3',
                'shape', 'area', 'totalPrice', 'unitPrice']
fileNameStr = 'lianjia.csv'
orig_df = pd.read_csv(fileNameStr, names=COLUMN_NAMES, encoding='gbk', dtype=str)
print(orig_df.describe())

# 1. Strip leading/trailing whitespace from every string column
for column in COLUMN_NAMES:
    orig_df[column] = orig_df[column].str.strip()

# 2. Convert area, unitPrice and totalPrice to float
for column in ('area', 'unitPrice', 'totalPrice'):
    orig_df[column] = orig_df[column].astype(float)


def _print_price_summary(title, column):
    """Print the highest, lowest and median value of *column*.

    The highest and lowest values are printed together with the name of the
    listing they belong to.
    """
    print(title)
    prices = orig_df[column]
    top = prices.idxmax()
    print("最贵", orig_df.loc[top, column], orig_df.loc[top, "name"])
    low = prices.idxmin()
    print("最便宜", orig_df.loc[low, column], orig_df.loc[low, "name"])
    print("中位数", prices.median())


# 3. Total price: maximum, minimum, median
_print_price_summary("总价：", 'totalPrice')

# 4. Unit price: maximum, minimum, median
_print_price_summary("单价：", 'unitPrice')

orig_df.to_csv("NewLianjia.csv", header=True, encoding="gbk", mode='w+', index=False, float_format='%.4f')

1.3 最终结果及分析结论

程序的输出如下：
在这里插入图片描述
csv前50条数据:

name	location1	location2	location3	shape	area	totalPrice	unitPrice
京贸国际城·峰景	通州	武夷花园	芙蓉东路1号（通燕高速耿庄桥北出口向南300米）	1室	69	540	68000.0000
观唐云鼎	密云	溪翁庄镇	溪翁庄镇密溪路39号院（云佛山度假村对面）	3室	357	1068	30000.0000
旭辉城	房山	房山其它	北京市房山区良锦街6号院旭辉城营销中心	2室	75	219	28500.0000
檀香府	门头沟	门头沟其它	京潭大街与潭柘十街交叉口	3室	124	530	43000.0000
中海丽春湖墅·合院	昌平	沙河	地铁昌平线沙河地铁站南600米	4室	263	1000	36000.0000
泰禾金府大院	丰台	西红门	南四环地铁新宫站南800米	4室	362	2600	75000.0000
电建地产洺悦苑	丰台	马家堡	南四环中路115号	3室	89	830	63000.0000
金樾和著	房山	房山其它	房山区良常路官道路口西800米	3室	89	300	34000.0000
和棠瑞著	平谷	平谷其它	金海湖景区坝前广场西侧500米	3室	305	530	19000.0000
尊悦光华	朝阳	CBD	北京市朝阳区光华东里甲1号院3号楼	3室	133	2600	130000.0000
北京城建北京合院	顺义	顺义其它	燕京街与通顺路交汇口东800米(仁和公园南)	3室	95	520	47000.0000
珠光御景西园	丰台	丰台其它	北京市丰台区长辛店长云路2号珠江御景营销中心	3室	117	450	39000.0000
北京城建北京合院	顺义	顺义其它	燕京街与通顺路交汇口东800米(仁和公园南)	4室	210	850	45000.0000
金隅花石匠	通州	临河里	砖厂北里链家门店	2室	88	605	33000.0000
国锐金嵿	亦庄开发区	亦庄	荣华南路1号院	5室	285	2200	80000.0000
首开香溪郡	通州	通州其它	宋庄镇荷香街2号院	5室	90	650	35000.0000
天润福熙大道	朝阳	北苑	清河营东路1号院, 清河营东路3号院	6室	436	4542	110000.0000
元熙华府	丰台	宋家庄	东南三环东铁营桥向南600米	3室	126	1113	87000.0000
京贸国际公馆	通州	九棵树(家乐福)	怡乐中路299号院（广渠快速路二期出口向南1000米）	1室	72	490	64000.0000
凯德麓语	昌平	昌平其它	兴寿镇京承高速G11出口向西怀昌路北侧	3室	280	850	35000.0000
新潮嘉园二期	通州	潞苑	潞苑南大街185号	1室	65	384	58000.0000
长海御墅	房山	房山其它	长沟国家湿地公园南侧	3室	224	420	23000.0000
中海云筑	大兴	大兴新机场	北京市大兴区团结路	3室	89	340	37000.0000
中海云筑	大兴	大兴新机场	京开高速庞各庄桥西1500米，团结路北（庞各庄镇宏轩饺子馆儿对面）	3室	266	900	37000.0000
远洋五里春秋	石景山	石景山其它	五里坨黑石头路前行500m	3室	290	1000	52024.0000
首创·河著	顺义	顺义其它	京承高速11出口（昌金路）向东900 米路北	4室	248	1200	41000.0000
华萃西山	门头沟	门头沟其它	永定镇地铁S1号线石厂西南700米	4室	83	666	47000.0000
住总正华·时代广场	大兴	天宫院	地铁4号线生物医药基地站南200米	0室	61	404	55000.0000
京西悦府	房山	阎村	北京市房山区燕房线阎村地铁站东南角约189米	3室	175	800	50000.0000
中粮天恒天悦壹号	丰台	西红门	南四环地铁新宫站南500米	4室	220	1900	85000.0000
中海云熙	大兴	大兴其它	魏善庄新城南中轴路东侧500米	2室	76	309	37700.0000
和光悦府	朝阳	朝阳其它	南皋路和光悦府	4室	120	1060	88000.0000
棠颂璟庐	亦庄开发区	亦庄开发区其它	鹿华路7号院（南海子公园北侧500米）	4室	250	1900	75000.0000
金隅上城郡	昌平	北七家	北亚花园东路50米	4室	212	930	45000.0000
万科弗农小镇	密云	溪翁庄镇	密关路西侧（密云水库南岸2公里）	3室	140	350	25000.0000
首开保利欢乐大都汇	门头沟	冯村	石门营环岛北50米		500	3250	65000.0000
中铁华侨城和园	大兴	瀛海	南五环南海子公园西侧约500米	3室	154	950	60000.0000
顺鑫颐和天璟	顺义	顺义其它	新城右堤路与昌金路交汇处向北200米	3室	110	360	33000.0000
誉天下盛寓	顺义	中央别墅区	中央别墅区榆阳路与林荫路交叉口	3室	120	720	60000.0000
泰禾金府大院	丰台	西红门	南四环地铁新宫站南800米	2室	175	1700	82000.0000
奥园雲水院	密云	溪翁庄镇	密云区Y753(走石路)	3室	111	250	22000.0000
国瑞熙墅	昌平	北七家	北七家镇岭上西路与定泗路交汇处东南角	3室	314	1500	48000.0000
中粮京西祥云	房山	长阳	地铁稻田站北800米，西邻京深路	4室	115	800	58000.0000
水岸壹号	房山	良乡	良乡大学城西站地铁南侧800米，刺猬河旁	3室	185	1100	58000.0000
观唐云鼎	密云	溪翁庄镇	溪翁庄镇密溪路39号院（云佛山度假村对面）	3室	172	499	30000.0000
运河铭著	通州	北关	商通大道与榆东一街交叉口，温榆河森林公园东500米	2室	100	490	49000.0000
万年广阳郡九号	房山	长阳	长阳清苑南街与汇商东路交汇处西北角	3室	166	830	50000.0000
首开璞瑅公馆	丰台	方庄	紫芳园五区	3室	203	2200	106000.0000
华远裘马四季	门头沟	大峪	增产路16号院	3室	156	950	55000.0000

可以看到，程序成功完成了对网站的爬取，并完成了数据最大值、最小值和中位数的提取，成功达到了题目要求。

2.作业二：雾霾指数数据分析

2.1 分析过程

首先读入csv文件，而后对读入数据进行预处理如下：

① 删除四个监测点PM值全部为空的行

② 计算PM平均值：对每一行记录分别计算各监测点PM值的和、非空PM值的数目，再用二者相除得到平均值，并将这三个数据作为三个新的列加入到表格中。

③ 通过groupby和mean计算PM指数年平均值，以及10-15年PM指数和温度月平均值的变化情况。

2.2 核心代码

beijing_weather_process.py

import pandas as pd
# 读取文件

filename = 'BeijingPM20100101_20151231.csv'
df = pd.read_csv(filename, encoding='utf-8')

# The four PM readings that are combined into one per-row average
PM_COLUMNS = ['PM_Dongsi', 'PM_Dongsihuan', 'PM_Nongzhanguan', 'PM_US Post']

# Drop only the rows in which all four PM readings are missing
df.dropna(axis=0, how='all', subset=PM_COLUMNS, inplace=True)

# Per-row PM statistics: sum of the available readings, number of available
# readings, and their average rounded to two decimals
df['PMsum'] = df[PM_COLUMNS].sum(axis=1)
df['PMcount'] = df[PM_COLUMNS].count(axis=1)
df['PMave'] = round(df['PMsum'] / df['PMcount'], 2)

# numeric_only=True keeps the aggregation working on pandas >= 2.0 if the CSV
# contains any non-numeric column; mean() would otherwise raise TypeError.
aveY_df = df.groupby('year').mean(numeric_only=True)
aveM_df = df.groupby(['year', 'month']).mean(numeric_only=True)

print(aveY_df.head())

# Keep the result columns by name rather than dropping hard-coded positional
# indices, which silently select the wrong data if the CSV layout changes.
# Write the yearly PM averages to file
aveY_df = aveY_df[['PMsum', 'PMcount', 'PMave']]
aveY_df.to_csv('PM_year_Ave.csv')

# Write the monthly PM and TEMP averages to file
aveM_df = aveM_df[['TEMP', 'PMsum', 'PMcount', 'PMave']]
aveM_df.to_csv('PM_TEMP_month_Ave.csv')

# Print the yearly PM averages
print(aveY_df)
print('------------------------------------------')
# Print the monthly PM and TEMP averages
print(aveM_df)

2.3 最终结果及分析结论

最终生成两个csv文件：“PM_year_Ave.csv”存储PM年平均值，“PM_TEMP_month_Ave.csv”存储PM和TEMP月平均值。

2.3.1 PM_year_Ave.csv

year	PMsum	PMcount	PMave
2010	104.0457	1	104.0457
2011	99.09324	1	99.09324
2012	90.53877	1	90.53877
2013	338.6743	3.579963	98.40268
2014	358.0453	3.876812	93.91771
2015	306.447	3.557648	85.85894

2.3.2 PM_TEMP_month_Ave.csv

截取前50行如下：

year	month	TEMP	PMsum	PMcount	PMave
2010	1	-6.37156	90.40367	1	90.40367
2010	2	-1.91505	97.23994	1	97.23994
2010	3	2.997179	94.04654	1	94.04654
2010	4	10.8078	80.07242	1	80.07242
2010	5	20.85346	87.07191	1	87.07191
2010	6	24.49381	109.0389	1	109.0389
2010	7	27.72984	123.4261	1	123.4261
2010	8	25.34763	97.68343	1	97.68343
2010	9	22.24145	122.7927	1	122.7927
2010	10	12.28302	118.7844	1	118.7844
2010	11	3.308735	138.384	1	138.384
2010	12	-2.05787	97.11575	1	97.11575
2011	1	-5.54383	44.8737	1	44.8737
2011	2	-0.85417	150.2902	1	150.2902
2011	3	6.966346	57.99199	1	57.99199
2011	4	14.69646	91.72067	1	91.72067
2011	5	20.73315	65.10815	1	65.10815
2011	6	25.64416	108.7947	1	108.7947
2011	7	26.46081	107.3865	1	107.3865
2011	8	25.66375	103.7338	1	103.7338
2011	9	19.2267	94.9694	1	94.9694
2011	10	13.19968	145.5568	1	145.5568
2011	11	5.94965	109.435	1	109.435
2011	12	-2.30686	108.7214	1	108.7214
2012	1	-4.94328	118.9224	1	118.9224
2012	2	-2.57391	84.44203	1	84.44203
2012	3	5.068919	96.47432	1	96.47432
2012	4	15.46314	87.83588	1	87.83588
2012	5	21.93488	90.96671	1	90.96671
2012	6	24.3291	96.63418	1	96.63418
2012	7	26.55959	80.64971	1	80.64971
2012	8	25.54735	81.16533	1	81.16533
2012	9	20.11517	59.95225	1	59.95225
2012	10	13.30811	94.95135	1	94.95135
2012	11	3.691977	87.43696	1	87.43696
2012	12	-4.35342	109.1873	1	109.1873
2013	1	-5.37568	366.5189	1.939189	183.1953
2013	2	-1.82143	331.4435	2.90625	113.5665
2013	3	5.405914	441.4798	3.767473	114.5728
2013	4	12.24861	243.9167	3.883333	63.04781
2013	5	21.45565	345.543	3.893817	89.14853
2013	6	23.67778	393.0306	3.533333	111.3548
2013	7	27.08221	272.3598	3.633423	74.93284
2013	8	26.57124	257.543	3.836022	67.92363
2013	9	20.125	329.6417	3.890278	85.71788
2013	10	12.82124	388.2151	3.803763	102.2088
2013	11	5.913889	339.3264	3.983333	85.14629
2013	12	-0.29301	352.7715	3.846774	90.31777
2014	1	-0.91398	424.0255	3.900538	107.9117
2014	2	-0.70238	580.2366	3.816964	160.5139

可以看到，程序成功完成了题目要求，生成了PM年平均值和PM-温度月平均值的变化情况表格。