Downloading data with Python
Downloading with urllib
When using Python as a crawler and download tool, urllib is a commonly used package. Its core download routine is very simple to use and requires only a single function call:
- urllib.request.urlretrieve():
urllib.request.urlretrieve(url, filename=None, reporthook=None, data=None)
"""
@filename: local path where the downloaded file is stored
@reporthook: optional callback; it is called with the block number, the block size of each read, and the total file size
"""
- In practice, the simplest usage takes only one line of code:
import urllib.request
urllib.request.urlretrieve(download_url, save_path)
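As a side note, urlretrieve also returns a (local_filename, headers) tuple, which is useful for a quick sanity check on what was actually written. A minimal sketch; the URL and file name below are placeholders:

import urllib.request

# Hypothetical URL and file name, for illustration only.
local_path, headers = urllib.request.urlretrieve(
    "https://example.com/data/sample.csv", "sample.csv")
print(local_path)                        # path of the saved file
print(headers.get("Content-Length"))     # headers is an http.client.HTTPMessage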
When we also want to display download progress, we can pass a callback function through reporthook. The callback's interface is fixed: it receives three arguments, namely the number of blocks transferred so far, the size of each block, and the total size of the file. A simple implementation follows, with a usage sketch after it.
- A simple reporthook implementation
def schedule(blocknum, blocksize, totalsize):
    """
    Callback function for urllib.request.urlretrieve.
    @blocknum: number of blocks transferred so far
    @blocksize: size of each block in bytes
    @totalsize: total size of the remote file in bytes
    """
    if totalsize == 0:
        percent = 0
    else:
        percent = blocknum * blocksize / totalsize
    if percent > 1.0:
        percent = 1.0
    percent = percent * 100
    print("download progress: %.4f%%" % percent)
Downloading with requests
Besides urllib, the requests module is another common choice. With requests most of the steps have to be done by hand, but that also makes operations such as resuming an interrupted download much easier to implement.
- Downloading data with requests:
import os
import requests

# The first request only reads the headers (stream=True defers the body) to get the total size.
r1 = requests.get(download_url, stream=True, verify=False)
total_size = int(r1.headers['Content-Length'])
r1.close()

# Resume from whatever has already been written to disk.
if os.path.exists(video_save_path):
    temp_size = os.path.getsize(video_save_path)
else:
    temp_size = 0

headers = {'Range': 'bytes=%d-' % temp_size}
r = requests.get(download_url, stream=True, verify=False, headers=headers)

base_name = os.path.basename(video_save_path)
with open(video_save_path, "ab") as f:
    for chunk in r.iter_content(chunk_size=1024):
        if chunk:
            temp_size += len(chunk)
            f.write(chunk)
            f.flush()
            print("download %s : %d / %d || %.4f%%" % (base_name, temp_size, total_size, temp_size / total_size * 100))
Multiprocessing
Python provides convenient multiprocessing facilities; here we take the simplest approach and spawn one process per URL:
- Multiprocessing
from multiprocessing import Process

def download():
    with open('download_urls.txt', 'r', encoding='utf-8') as f:
        urls = f.readlines()

    # One process per URL; start them all, then wait for them to finish.
    process = []
    for url in urls:
        process.append(Process(target=_download, args=[url.strip()]))
    [p.start() for p in process]
    [p.join() for p in process]
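One process per URL is fine for a handful of files but does not scale to long URL lists. A multiprocessing.Pool caps the number of concurrent downloads; a sketch under the same assumptions (the _download worker and download_urls.txt file from above):

from multiprocessing import Pool

def download_with_pool(max_workers=4):
    with open('download_urls.txt', 'r', encoding='utf-8') as f:
        urls = [line.strip() for line in f if line.strip()]

    # At most max_workers downloads run at the same time.
    with Pool(processes=max_workers) as pool:
        pool.map(_download, urls)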
Complete code
"""
* 提供了多进程下载
* 使用urllib.request包,当下载失败时自动重新下载
"""
import os
import socket
import urllib.request
import logging
from multiprocessing import Process

# Give up on a stalled transfer after 30 seconds so the retry loop can take over.
socket.setdefaulttimeout(30)
logging.basicConfig(
    level=logging.DEBUG,
    filename="demo.log",
    filemode="w",
    format="%(asctime)s - %(name)s - %(levelname)-9s - %(filename)-8s : %(lineno)s line - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
def _download(*args):
    """
    Worker function: each child process downloads the file at the URL it is given.
    """
    logger = logging.getLogger()
    fh = logging.FileHandler('log.txt', mode='a', encoding='utf-8', delay=False)
    logger.addHandler(fh)

    download_url = args[0]
    base_name = os.path.basename(download_url)
    video_save_path = os.path.join('save/', base_name)
    os.makedirs('save/', exist_ok=True)  # make sure the output directory exists

    def schedule(blocknum, blocksize, totalsize):
        """
        Callback for urllib.request.urlretrieve; the interface is fixed.
        @blocknum: number of blocks transferred so far
        @blocksize: size of each block in bytes
        @totalsize: total size of the remote file in bytes
        """
        if totalsize == 0:
            percent = 0
        else:
            percent = blocknum * blocksize / totalsize
        if percent > 1.0:
            percent = 1.0
        percent = percent * 100
        print("download %s : %.4f%%" % (base_name, percent))

    # Retry until the download succeeds; timeouts and network errors restart it.
    while True:
        try:
            logger.info('downloading %s ...' % download_url)
            urllib.request.urlretrieve(download_url, video_save_path, schedule)
        except Exception as e:
            logger.info('exception has occurred in downloading {}: {}'.format(download_url, e))
            continue
        break
def download():
    with open('download.txt', 'r', encoding='utf-8') as f:
        lines = f.readlines()

    process = []
    for line in lines:
        process.append(Process(target=_download, args=[line.strip()]))
    [p.start() for p in process]
    [p.join() for p in process]

if __name__ == '__main__':
    download()