爬虫
网页采集器
import requests
if __name__ == '__main__':
    # Simple page collector: fetch the Sogou search-result page for a keyword
    # typed by the user and save the raw HTML as "<keyword>.html" in the
    # current working directory.

    # UA spoofing. The original notes left a non-ASCII placeholder here
    # ("fill in the browser UA found via packet capture"), which is not a
    # usable HTTP header value; use a real browser User-Agent string instead
    # (copy your own from the browser's F12 network panel if preferred).
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36'
    }
    url = 'https://www.sogou.com/web'

    # The keyword is sent as the "query" URL parameter.
    kw = input('enter a word:')
    param = {
        'query': kw
    }

    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(url=url, params=param, headers=headers, timeout=10)
    page_text = response.text

    # Persist the page; the file name is derived directly from the keyword.
    file_name = kw + '.html'
    with open(file_name, 'w', encoding='utf-8') as fp:
        fp.write(page_text)
    print(file_name, '保存成功')
问题
到这该学fiddler抓包了
解决了,稍微了解抓包,用f12就可以进行模拟浏览器
破解百度翻译
post请求(携带参数)
响应数据是json数据
import requests
import json
if __name__ == '__main__':
    # Query Baidu Translate's suggestion endpoint with a POST request and
    # store the JSON response as "<word>.json" in the current directory.
    post_url = 'https://fanyi.baidu.com/sug'

    # The word to look up is sent as the "kw" form field.
    word = input('请输入一个单词:')
    data = {
        'kw': word
    }

    # UA spoofing so the request looks like it comes from a browser.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36'
    }

    # Fix: the headers dict was previously built but never passed to the
    # request. A timeout is added so a stalled connection cannot hang the
    # script indefinitely.
    response = requests.post(url=post_url, data=data, headers=headers, timeout=10)

    # The response body is JSON; .json() decodes it into Python objects.
    dic_obj = response.json()
    print(dic_obj)

    # ensure_ascii=False keeps Chinese text readable in the output file
    # instead of writing \uXXXX escapes.
    filename = word + '.json'
    with open(filename, 'w', encoding='utf-8') as fp:
        json.dump(dic_obj, fp=fp, ensure_ascii=False)
    print('over')
豆瓣电影
import requests
import json
if __name__ == '__main__':
    # Fetch one page of Douban's movie chart via its JSON endpoint and save
    # the decoded response to "douban.json" in the current directory.
    url = 'https://movie.douban.com/j/chart/top_list'

    # Query-string parameters, reproduced as captured from the browser.
    # NOTE(review): 'start' and 'limit' look like offset / page size, and
    # 'type' like a genre id - confirm against the site before relying on it.
    param = {
        'type': '24',
        'interval_id': '100:90',
        'action': '',
        'start': '0',
        'limit': '20'
    }

    # UA spoofing so the request looks like it comes from a browser.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36'
    }

    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(url=url, params=param, headers=headers, timeout=10)
    list_data = response.json()

    # Fix: the original path '.\douban.json' contained the invalid escape
    # sequence \d and a Windows-only separator. A bare relative file name
    # targets the same location (current directory) on every platform.
    with open('douban.json', 'w', encoding='utf-8') as fp:
        json.dump(list_data, fp=fp, ensure_ascii=False)
    print('over')
肯德基位置信息
import requests
import json
if __name__ == '__main__':
    # Look up KFC store locations for a user-supplied keyword and save the
    # raw response text as "<keyword>.text" in the current directory.
    url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'

    location = input('Please input loaltion:')

    # Request parameters; only 'keyword' comes from the user, the rest are
    # fixed values (first page, ten results per page).
    param = {
        'cname': '',
        'pid': '',
        'keyword': location,
        'pageIndex': '1',
        'pageSize': '10'
    }

    # UA spoofing so the request looks like it comes from a browser.
    # Fix: a stray "s" after the closing brace made this a syntax error.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36'
    }

    # NOTE(review): this endpoint may expect a POST with the parameters as
    # form data rather than a GET - confirm in the browser's network panel.
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(url=url, params=param, headers=headers, timeout=10)
    page_text = response.text

    filename = location + '.text'
    with open(filename, 'w', encoding='utf-8') as fp:
        fp.write(page_text)
    print('over')
药监总局相关数据
- 动态加载
- 首页中对应的企业数据信息是通过ajax动态请求到的
- 通过对详情页url观察发现:
- url域名相同,只有携带的参数不同
- id值可以从首页对应的ajax请求返回的数据中获取到
- 域名和id值拼接一个完整的企业详情页url
观察发现:
- 所有详情页请求的url都一样,只有参数id不同
获取企业id后, 就可以将url与id拼接形成完整的详情页数据对应的ajax请求的url
数据解析
聚焦爬虫:正则 bs4、xpath
持续更新中…
|