前言
最近在学习Python爬虫的编写,发现很多网站设置了IP访问频率限制,请求过于频繁时IP会被封禁(ban),于是想到了使用代理池技术。
正文
请求代理池
这里我选用了一个国外的免费代理池。由于该网页返回的内容已经整理成每行一个“ip:端口”的格式,所以不需要再用re模块去提取ip和端口。我用requests库请求该页面,并把未筛选的代理ip保存到文本文件1.txt中:
# NOTE(review): url is an empty string in this listing -- presumably the real
# address of the proxy-list page was removed; set it before running.
url = ""
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072.55"
}
# `headers` has to be passed by keyword: the second positional parameter of
# requests.get() is `params`, which would turn the User-Agent into a query
# string instead of an HTTP header. The timeout keeps a dead server from
# blocking the script forever.
resp = requests.get(url, headers=headers, timeout=10)
# Store the raw, unfiltered list so the filtering step can read it back.
with open("1.txt", "w") as f1:
    f1.write(resp.text)
筛选代理ip
先编写筛选单个ip的函数,再使用多线程筛选多个ip:
def check_ip(url, line):
    """Probe one ``ip:port`` proxy and record it in ``proxy_list`` if it works.

    Args:
        url: Address requested through the proxy as the availability check.
        line: One candidate entry in ``ip:port`` form.

    Returns:
        None. A proxy that answers with HTTP 200 is appended to the
        module-level ``proxy_list``; blank or malformed entries are ignored.
    """
    proxy = line.strip().split(":")
    # Blank lines or lines without a port would otherwise fail with
    # IndexError on proxy[1]; there is nothing to test, so skip them.
    if len(proxy) < 2 or not proxy[0] or not proxy[1]:
        return
    ip = proxy[0]
    port = proxy[1]
    # requests picks the proxy by the scheme of the *target* URL, so the keys
    # must be "http"/"https"; a "socks5" key is never looked up.
    check_proxy = {
        "http": "http://" + ip + ":" + port,
        "https": "http://" + ip + ":" + port,
    }
    try:
        # `proxies` has to be passed by keyword: the second positional
        # parameter of requests.get() is `params`, which would send the dict
        # as a query string and bypass the proxy entirely.
        proxy_resp = requests.get(url, proxies=check_proxy, timeout=5)
    except Exception as e:
        # Any failure (refused, timed out, bad reply) means "not usable".
        print(e)
        print("[-]当前代理:" + ip + ":" + port + "不可用!")
        return
    if proxy_resp.status_code == 200:
        print("[+]当前代理:" + ip + ":" + port + "可用!")
        proxy_list.append(ip + ":" + port)
    else:
        print("[-]当前代理:" + ip + ":" + port + "不可用!")
# Start one checker thread per candidate so the slow network probes overlap,
# then wait for every thread before moving on; otherwise the result list may
# still be incomplete when it is saved.
threads = []
with open("1.txt", "r") as f2:
    for f in f2:
        f = f.strip()
        # Skip blank lines, comment lines and anything that is not "ip:port".
        if not f or f.startswith("#") or ":" not in f:
            continue
        t1 = threading.Thread(target=check_ip, args=(url, f))
        t1.start()
        threads.append(t1)
for t1 in threads:
    t1.join()
保存可用的代理ip
# Persist every proxy that passed the check, one "ip:port" entry per line.
with open("2.txt", "w") as f3:
    f3.writelines(pl1 + '\n' for pl1 in proxy_list)
把代理ip转发到本地(可选)
这一部分是可选的,我写它纯粹是为了巩固所学知识(doge)。目前每次运行这段代码都会报“数组下标越界”(即Python的 IndexError: list index out of range)错误,还请各位大佬多多指导!
def portforward(prip, prpo):
    """Relay connections made to 127.0.0.1:5320 to the proxy ``prip:prpo``.

    Clients are served one at a time: the first chunk received from the
    client (up to 1024 bytes) is sent to the proxy, then the proxy's reply is
    copied back until the proxy closes the connection, stays silent for
    3 seconds, or a socket error occurs.

    Args:
        prip: Host name or IP address of the upstream proxy.
        prpo: Port of the upstream proxy, as an int or a numeric string.

    Returns:
        A message starting with "[-]" that describes the failure which
        stopped the forwarder. While the listener is healthy the function
        keeps serving and does not return.
    """
    # Ports read from a text file arrive as strings, but socket.connect()
    # requires an int; convert once and reject garbage before binding.
    try:
        prpo = int(prpo)
    except (TypeError, ValueError) as e:
        print("[-]Invalid proxy port : " + str(e))
        return "[-]Invalid proxy port : " + str(e)

    server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    # `with` closes the listener on every exit path so port 5320 is released.
    with server:
        try:
            server.bind(('127.0.0.1', 5320))
            server.listen(10)
        except socket.error as e:
            print("[-]The local service : " + str(e))
            return "[-]The local service : " + str(e)

        while True:
            try:
                client, addr = server.accept()
            except socket.error as e:
                print("[-]Local receiving client : " + str(e))
                return "[-]Local receiving client : " + str(e)

            with client:
                print('[*]accept %s connect' % (addr,))
                try:
                    data = client.recv(1024)
                except socket.error as e:
                    print("[-]Local receiving client : " + str(e))
                    return "[-]Local receiving client : " + str(e)
                if not data:
                    # The client closed without sending anything; keep
                    # listening for the next one.
                    continue
                # Take the timestamp at logging time so each line is current.
                print('[*' + time.asctime() + ']: Accept data...')

                # Bounded retries: a dead proxy must not spin forever, and
                # each failed attempt closes its socket.
                target_socket = None
                last_error = None
                for _ in range(3):
                    print("[!]Now proxy ip:" + prip + ":" + str(prpo))
                    candidate = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                    candidate.settimeout(3)
                    try:
                        candidate.connect((prip, prpo))
                    except socket.error as e:
                        candidate.close()
                        last_error = e
                        print("[-]RE_Connect...")
                        continue
                    target_socket = candidate
                    break
                if target_socket is None:
                    print("[-]Connect to the proxy server : " + str(last_error))
                    return "[-]Connect to the proxy server : " + str(last_error)

                with target_socket:
                    try:
                        # sendall() keeps writing until the whole chunk is out.
                        target_socket.sendall(data)
                    except socket.error as e:
                        print("[-]Sent to the proxy server : " + str(e))
                        return "[-]Sent to the proxy server : " + str(e)
                    while True:
                        try:
                            data_1 = target_socket.recv(1024)
                            if not data_1:
                                break
                            print('[*' + time.asctime() + ']: Send data...')
                            client.sendall(data_1)
                        except socket.error as e:
                            # Covers timeouts too: stop relaying for this
                            # client instead of retrying indefinitely.
                            print(prip + ":" + str(prpo))
                            print("[-]Back to the client : " + str(e))
                            break
with open("2.txt", "r") as f4:
    f5 = f4.readlines()
for pl2 in f5:
    pl2 = pl2.strip()
    # Blank lines would fail with IndexError after split(":"), so skip them
    # together with comment lines.
    if not pl2 or pl2.startswith('#'):
        continue
    pl2 = pl2.split(":")
    if len(pl2) < 2 or not pl2[1].isdigit():
        continue
    proxy_ip = pl2[0]
    # socket.connect() needs the port as an int, not as text from the file.
    proxy_port = int(pl2[1])
    t2 = threading.Thread(target=portforward, args=(proxy_ip, proxy_port))
    t2.start()
    # Every forwarder binds the same local port (5320), so only one can run
    # at a time: wait for this one to stop before trying the next proxy.
    t2.join()
最后贴出完整代码:
import socket
import requests
import threading
import time
# "ip:port" strings of the proxies that passed the availability check.
proxy_list = list()
# Human-readable local timestamp, evaluated once when the module is loaded.
localtime = time.asctime()
def check_ip(url, line):
    """Probe one ``ip:port`` proxy and record it in ``proxy_list`` if it works.

    Args:
        url: Address requested through the proxy as the availability check.
        line: One candidate entry in ``ip:port`` form.

    Returns:
        None. A proxy that answers with HTTP 200 is appended to the
        module-level ``proxy_list``; blank or malformed entries are ignored.
    """
    proxy = line.strip().split(":")
    # Blank lines or lines without a port would otherwise fail with
    # IndexError on proxy[1]; there is nothing to test, so skip them.
    if len(proxy) < 2 or not proxy[0] or not proxy[1]:
        return
    ip = proxy[0]
    port = proxy[1]
    # requests picks the proxy by the scheme of the *target* URL, so the keys
    # must be "http"/"https"; a "socks5" key is never looked up.
    check_proxy = {
        "http": "http://" + ip + ":" + port,
        "https": "http://" + ip + ":" + port,
    }
    try:
        # `proxies` has to be passed by keyword: the second positional
        # parameter of requests.get() is `params`, which would send the dict
        # as a query string and bypass the proxy entirely.
        proxy_resp = requests.get(url, proxies=check_proxy, timeout=5)
    except Exception as e:
        # Any failure (refused, timed out, bad reply) means "not usable".
        print(e)
        print("[-]当前代理:" + ip + ":" + port + "不可用!")
        return
    if proxy_resp.status_code == 200:
        print("[+]当前代理:" + ip + ":" + port + "可用!")
        proxy_list.append(ip + ":" + port)
    else:
        print("[-]当前代理:" + ip + ":" + port + "不可用!")
def portforward(prip, prpo):
    """Relay connections made to 127.0.0.1:5320 to the proxy ``prip:prpo``.

    Clients are served one at a time: the first chunk received from the
    client (up to 1024 bytes) is sent to the proxy, then the proxy's reply is
    copied back until the proxy closes the connection, stays silent for
    3 seconds, or a socket error occurs.

    Args:
        prip: Host name or IP address of the upstream proxy.
        prpo: Port of the upstream proxy, as an int or a numeric string.

    Returns:
        A message starting with "[-]" that describes the failure which
        stopped the forwarder. While the listener is healthy the function
        keeps serving and does not return.
    """
    # Ports read from a text file arrive as strings, but socket.connect()
    # requires an int; convert once and reject garbage before binding.
    try:
        prpo = int(prpo)
    except (TypeError, ValueError) as e:
        print("[-]Invalid proxy port : " + str(e))
        return "[-]Invalid proxy port : " + str(e)

    server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    # `with` closes the listener on every exit path so port 5320 is released.
    with server:
        try:
            server.bind(('127.0.0.1', 5320))
            server.listen(10)
        except socket.error as e:
            print("[-]The local service : " + str(e))
            return "[-]The local service : " + str(e)

        while True:
            try:
                client, addr = server.accept()
            except socket.error as e:
                print("[-]Local receiving client : " + str(e))
                return "[-]Local receiving client : " + str(e)

            with client:
                print('[*]accept %s connect' % (addr,))
                try:
                    data = client.recv(1024)
                except socket.error as e:
                    print("[-]Local receiving client : " + str(e))
                    return "[-]Local receiving client : " + str(e)
                if not data:
                    # The client closed without sending anything; keep
                    # listening for the next one.
                    continue
                # Take the timestamp at logging time so each line is current.
                print('[*' + time.asctime() + ']: Accept data...')

                # Bounded retries: a dead proxy must not spin forever, and
                # each failed attempt closes its socket.
                target_socket = None
                last_error = None
                for _ in range(3):
                    print("[!]Now proxy ip:" + prip + ":" + str(prpo))
                    candidate = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                    candidate.settimeout(3)
                    try:
                        candidate.connect((prip, prpo))
                    except socket.error as e:
                        candidate.close()
                        last_error = e
                        print("[-]RE_Connect...")
                        continue
                    target_socket = candidate
                    break
                if target_socket is None:
                    print("[-]Connect to the proxy server : " + str(last_error))
                    return "[-]Connect to the proxy server : " + str(last_error)

                with target_socket:
                    try:
                        # sendall() keeps writing until the whole chunk is out.
                        target_socket.sendall(data)
                    except socket.error as e:
                        print("[-]Sent to the proxy server : " + str(e))
                        return "[-]Sent to the proxy server : " + str(e)
                    while True:
                        try:
                            data_1 = target_socket.recv(1024)
                            if not data_1:
                                break
                            print('[*' + time.asctime() + ']: Send data...')
                            client.sendall(data_1)
                        except socket.error as e:
                            # Covers timeouts too: stop relaying for this
                            # client instead of retrying indefinitely.
                            print(prip + ":" + str(prpo))
                            print("[-]Back to the client : " + str(e))
                            break
if __name__ == '__main__':
    # NOTE(review): url is an empty string in this listing -- presumably the
    # real address of the proxy-list page was removed; set it before running.
    url = ""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072.55"
    }
    # `headers` has to be passed by keyword: the second positional parameter
    # of requests.get() is `params`, which would turn the User-Agent into a
    # query string instead of an HTTP header.
    resp = requests.get(url, headers=headers, timeout=10)
    # Step 1: save the raw list. The name must match the file read below.
    with open("1.txt", "w") as f1:
        f1.write(resp.text)

    # Step 2: check every candidate in its own thread, then wait for all of
    # them so proxy_list is complete before it is saved.
    threads = []
    with open("1.txt", "r") as f2:
        for f in f2:
            f = f.strip()
            # Skip blank lines, comments and anything that is not "ip:port".
            if not f or f.startswith("#") or ":" not in f:
                continue
            t1 = threading.Thread(target=check_ip, args=(url, f))
            t1.start()
            threads.append(t1)
    for t1 in threads:
        t1.join()

    # Step 3: persist the usable proxies, one "ip:port" per line.
    with open("2.txt", "w") as f3:
        for pl1 in proxy_list:
            f3.write(pl1 + '\n')

    # Step 4 (optional): forward the local port to the usable proxies.
    with open("2.txt", "r") as f4:
        f5 = f4.readlines()
    for pl2 in f5:
        pl2 = pl2.strip()
        # Blank lines would fail with IndexError after split(":").
        if not pl2 or pl2.startswith('#'):
            continue
        pl2 = pl2.split(":")
        if len(pl2) < 2 or not pl2[1].isdigit():
            continue
        proxy_ip = pl2[0]
        # socket.connect() needs the port as an int, not as text.
        proxy_port = int(pl2[1])
        t2 = threading.Thread(target=portforward, args=(proxy_ip, proxy_port))
        t2.start()
        # Every forwarder binds the same local port (5320), so only one can
        # run at a time: wait for it to stop before trying the next proxy.
        t2.join()
总结
编写这个脚本,是对我学习知识的一个小总结。然而,这还不是终点。随着学习的深入,以后我会尝试使用面向对象的编程知识,写出更厉害的python工具和脚本!
|