一、介绍
主要通过 Python 的 fake_useragent、asyncio、aiohttp、requests、lxml、pandas 等库抓取信息:用 XPath 定位页面中各项信息的位置,并使用异步方法请求页面。
二、代码(部分)
1. 异步方法进行请求
async def request(self, url):
    """Fetch ``url`` with an HTTP GET and return the response body as text.

    A new ``aiohttp.ClientSession`` is opened for each call and the request
    is sent with ``self.headers`` under a 3-second total timeout.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded response body when the server answers with status 200;
        ``None`` for any other status or when the request raises.
    """
    # ClientTimeout is aiohttp's documented timeout structure; total=3 keeps
    # the same 3-second overall limit the bare number expressed.
    timeout = aiohttp.ClientTimeout(total=3)
    async with aiohttp.ClientSession() as session:
        try:
            async with session.get(url, headers=self.headers, timeout=timeout) as response:
                if response.status == 200:
                    return await response.text()
                # Report non-200 answers instead of returning None silently.
                print('请求未成功: {} (状态码 {})'.format(url, response.status))
        except Exception as e:
            # Best-effort fetch: report the failure and fall through to None.
            # repr() is printed because some exceptions (e.g. a bare timeout)
            # have empty ``args`` and would otherwise print just "()".
            print('请求出错: {} ({!r})'.format(url, e))
    return None
2. 发送请求获得页数
# Return the total page count of the rental listing for ``city``, plus one.
#
# The city is mapped to a URL fragment by ``self.get_city_letter`` and the
# listing page is fetched with ``requests.get`` using ``self.headers``.
# On HTTP 200 the ``data-totalpage`` attribute is read via XPath and
# ``int(value) + 1`` is returned; on any other status a failure message is
# printed and the function falls off the end, returning ``None``.
def get_page_all(self,city):
city_letter = self.get_city_letter(city)
# NOTE(review): the right-hand side of this assignment is missing in the
# source, so the block does not parse as written. Presumably it was the
# first-page listing URL formatted with ``city_letter`` — confirm and restore.
url =
response = requests.get(url,headers=self.headers)
if response.status_code == 200:
html = etree.HTML(response.text)
# xpath() yields a list; [0] takes the first match and raises IndexError
# when the attribute is absent from the page.
page_all = html.xpath('//*[@id="content"]/div[1]/div[2]/@data-totalpage')[0]
print("租房信息获取成功!")
# The +1 presumably lets callers use the result as an exclusive range() bound
# (parse_data_all iterates range(1, page_all)) — TODO confirm against caller.
return int(page_all) + 1
else:
print('获取所有页码请求未成功!')
3. 异步方法,根据总页数循环解析页面抓取信息,包括标题、区域、面积、楼层、价格
async def parse_data_all(self, page_all, city):
    """Download every listing page for ``city`` and append the rows to a CSV.

    Pages ``1 .. page_all - 1`` are requested one after another through
    ``self.request``. From each page the title, district, neighbourhood,
    area and price of every listing are extracted by XPath, cleaned with
    ``self.remove_spaces`` and appended to ``'<city>租房信息.csv'``.

    Args:
        page_all: Exclusive upper bound of the page numbers to fetch.
        city: City name; passed to ``self.get_city_letter`` to build the URL
            sub-domain and used in the output file name.
    """
    import os  # local import: only needed for the header check below

    # The sub-domain does not depend on the page number, so resolve it once.
    city_letter = self.get_city_letter(city)
    csv_path = '{}租房信息.csv'.format(city)
    # Appending with a header on every page scatters header rows through the
    # file; write it only when the file is new or still empty.
    write_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0

    for i in range(1, page_all):
        if i == 1:
            url = 'https://{}.lianjia.com/zufang/ab200301001000rt200600000001'.format(city_letter)
        else:
            url = 'https://{}.lianjia.com/zufang/ab200301001000pg{}rt200600000001'.format(city_letter, i)

        html_text = await self.request(url)
        # self.request yields None when the download fails; parsing that
        # would raise, so skip the page and carry on with the next one.
        html = etree.HTML(html_text) if html_text else None
        if html is None:
            print('第' + str(i) + '页请求失败,已跳过!')
            continue
        print('获取' + str(i) + '页信息!')

        title_all = html.xpath('//*[@id="content"]/div[1]/div[1]/div/div/p[1]/a/text()')
        big_region_all = html.xpath('//*[@id="content"]/div[1]/div[1]/div/div/p[2]/a[1]/text()')
        small_region_all = html.xpath('//*[@id="content"]/div[1]/div[1]/div/div/p[2]/a[2]/text()')
        square_all = html.xpath('//*[@id="content"]/div[1]/div[1]/div/div/p[2]/text()[5]')
        price_all = html.xpath('//*[@id="content"]/div[1]/div[1]/div/div/span/em/text()')

        data_page = {
            '标题': self.remove_spaces(title_all),
            '区名': self.remove_spaces(big_region_all),
            '小区/位置': self.remove_spaces(small_region_all),
            'square': self.remove_spaces(square_all),
            'price': self.remove_spaces(price_all),
        }

        print('写入第' + str(i) + '页数据!')
        # NOTE(review): pandas raises ValueError if the columns differ in
        # length (e.g. a listing missing one field) — not handled here.
        df = pandas.DataFrame(data_page)
        df.to_csv(csv_path, mode='a', encoding='utf_8_sig', index=False, header=write_header)
        write_header = False
|