These days lj no longer publishes housing transaction records, so all we can scrape is basic listing information.
Quite a few modules are involved, so import them all in one go:
import requests
import random
from pyquery import PyQuery as pq
import re
import pymysql
from pymysql.converters import escape_string
import threading
import time
# fill in your own list of User-Agent strings (random.choice fails on an empty list)
uas = []
ua = random.choice(uas)
# request headers
headers = {
    'User-Agent': ua,
    'Host': 'bj.lianjia.com',
    'Referer': 'https://bj.lianjia.com/ershoufang/dongcheng/pg2/'
}
# put your proxy IPs here, format ip:port
li = []
proxy = f'username:password@{random.choice(li)}'
proxies = {
    'http': 'http://' + proxy,
    'https': 'http://' + proxy,
}
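Optionally, it is worth a quick sanity check that the proxy actually works before crawling. A minimal sketch, assuming an IP-echo endpoint such as httpbin.org/ip (this check is my addition, not part of the original script):

# hypothetical proxy check, not in the original flow
try:
    check = requests.get('https://httpbin.org/ip', proxies=proxies, timeout=5)
    print('proxy ok:', check.text)
except requests.RequestException as e:
    print('proxy failed:', e)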
# create the database and table in advance
conn = pymysql.connect(user='root', password='123123123', host='localhost', port=3306, database='lianjia')
cursor = conn.cursor()
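The post never shows the DDL, so here is one possible schema matching the INSERT used later; the column types and lengths are my assumptions (everything is stored as text, since the scraped fields are strings):

# hypothetical schema sketch; adjust types and lengths as needed
cursor.execute("""
create table if not exists bj_chaoyang(
    id int auto_increment primary key,
    title varchar(255),
    street varchar(64),
    region varchar(64),
    layout varchar(32),
    area varchar(32),
    orientation varchar(32),
    deco varchar(32),
    floor varchar(64),
    built_time varchar(32),
    built_structure varchar(32),
    tag varchar(255),
    price varchar(32),
    unit_price varchar(64)
) default charset=utf8mb4
""")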
First, get the maximum page count (pagesize) from the landing page. This turned out to be quite fiddly: the a nodes holding the page numbers could not be extracted directly, so I ended up taking a detour and pulling the number out of the page-data attribute with a pyquery + re combination.
# get page size
def get_pagesize():
    res = requests.get(url=url, headers=headers)
    res.encoding = 'utf8'
    html = res.text
    doc = pq(html)
    pagesize_list = doc('.page-box.house-lst-page-box').attr('page-data')
    str_ps_list = str(pagesize_list)
    #print(pagesize_list)
    ps_pattern = re.compile(r'"totalPage":(\d+)')
    ps = ps_pattern.search(str_ps_list).group(1)
    #print(ps)
    return ps
Next, define the parsing function.
[Note] Because the variables are not interpolated into the SQL string with % directly, but passed separately as execute(sql, values), the %s placeholders in the SQL statement must not be wrapped in quotes; otherwise you get a syntax error.
[Also note] Be careful never to move lock.acquire into the for li loop, or it will raise an error.
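To make the first note concrete, here is a minimal single-column sketch of the wrong and right placeholder usage (illustrative only, not part of the script):

# wrong: quoting the placeholder collides with pymysql's own escaping -> syntax error
#   cursor.execute("insert into bj_chaoyang(title) values('%s')", (title,))
# right: leave %s bare and pass the value separately
#   cursor.execute("insert into bj_chaoyang(title) values(%s)", (title,))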
def parse_listpage(url):
    time.sleep(0.25)
    try:
        res = requests.get(url=url, headers=headers, proxies=proxies)
    except requests.RequestException:
        # retry once on failure
        print('Error')
        res = requests.get(url=url, headers=headers, proxies=proxies)
    # free a semaphore slot so the main loop can start the next worker
    semaphore.release()
    res.encoding = 'utf8'
    html = res.text
    doc = pq(html)
    #print(doc)
    sell_list = doc('.sellListContent')
    detail_list = sell_list('.clear.LOGVIEWDATA.LOGCLICKDATA').items()
    lock.acquire()
    for li in detail_list:
        #print(li,'\n')
        title = li('.title').text()
        # position splits further into street and region
        position = li('.flood').text()
        street = position.split('-')[0].strip()
        street = escape_string(street)
        region = position.split('-')[1].strip()
        # house info splits further into 7 parts
        house_info = li('.address').text()
        house_info = house_info.split('|')
        layout = house_info[0].strip()
        area = house_info[1].strip()
        area = escape_string(area)
        orientation = house_info[2].strip()
        deco = house_info[3].strip()
        floor = house_info[4].strip()
        floor = escape_string(floor)
        if len(house_info) > 5:
            built_time = house_info[5].strip()
        else:
            built_time = 'None'
        if len(house_info) > 6:
            built_structure = house_info[6].strip()
        else:
            built_structure = 'None'
        tag = li('.tag').text()
        price = li('.totalPrice.totalPrice2').text()
        unit_price = li('.unitPrice').text()
        unit_price = escape_string(unit_price)
        print(title, '--', street, '--', region, '--', layout,
              '--', area, '--', orientation, '--', deco, '--',
              floor, '--', built_time, '--', built_structure, '--',
              tag, '--', price, '--', unit_price)
        sql = "insert into bj_chaoyang(title,street,region,layout,area,orientation,deco,floor,built_time,built_structure,tag,price,unit_price) \
              values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        values = (title, street, region, layout, area, orientation, deco, floor, built_time, built_structure, tag, price, unit_price)
        cursor.execute(sql, values)
        conn.commit()
    lock.release()
The main function.
Note that pagesize has to be wrapped in int() inside the list comprehension, because the re extraction returns a str.
Also, rt.join() blocks the main thread until every worker finishes; without it, conn could be closed while child threads are still running, and the remaining data would never make it into the database.
if __name__ == '__main__':
    url = 'https://bj.lianjia.com/ershoufang/chaoyang/'
    #parse_listpage(url)
    # num of last page
    pagesize = get_pagesize()
    #print(pagesize)
    urls = [f'https://bj.lianjia.com/ershoufang/chaoyang/pg{i}/' for i in range(2, int(pagesize) + 1)]
    print(urls)
    lock = threading.Lock()
    li_rt = []
    semaphore = threading.BoundedSemaphore(5)
    for url in urls:
        print(url, '\n')
        semaphore.acquire()
        t = threading.Thread(target=parse_listpage, args=(url,))
        t.start()
        li_rt.append(t)
    for rt in li_rt:
        rt.join()
    conn.close()
Results: