# Method 1: scrape all city temperatures and save them to a CSV file
"""
需求:
1.爬取全国所有的城市名称以及对应的气温
2.保存所有的城市名称以及对应的气温到为csv文件
目标url:
1.华北地区:http://www.weather.com.cn/textFC/hb.shtml
2.东北地区:http://www.weather.com.cn/textFC/db.shtml
3.华东地区:http://www.weather.com.cn/textFC/hd.shtml
4.华中地区:http://www.weather.com.cn/textFC/hz.shtml
5.华南地区:http://www.weather.com.cn/textFC/hn.shtml
6.西北地区:http://www.weather.com.cn/textFC/xb.shtml
7.西南地区:http://www.weather.com.cn/textFC/xn.shtml
8.港澳台地区:http://www.weather.com.cn/textFC/gat.shtml
规律: 'http://www.weather.com.cn/textFC/' + dq_name + '.shtml' 其中,dq_name = [hb,db,hd,hz,hn,xb,xn,gat]
"""
import requests
def get_source(url, timeout=10):
    """Fetch *url* and return the page body decoded as UTF-8 text.

    Args:
        url: page URL to download.
        timeout: seconds to wait for the server; without it the original
            call could block indefinitely on a stalled connection.

    Returns:
        The page HTML as a str.

    Raises:
        requests.RequestException: on network failure or an HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of handing an error page to the parser.
    response.raise_for_status()
    # The site serves UTF-8; set it explicitly so .text decodes correctly.
    response.encoding = 'utf-8'
    return response.text
from bs4 import BeautifulSoup
def get_info(source):
    """Parse one region page and extract (city, high, low) temperature rows.

    Args:
        source: HTML text of a www.weather.com.cn/textFC region page.

    Returns:
        A list of (city, temp_high, temp_low) str tuples; an empty list
        when the expected 'conMidtab' container is missing (error page or
        changed layout), instead of crashing with AttributeError.
    """
    # html5lib tolerates the site's malformed table markup that stricter
    # parsers choke on.
    soup = BeautifulSoup(source, 'html5lib')
    conMidtab = soup.find('div', class_='conMidtab')
    if conMidtab is None:
        return []
    info = []
    for table in conMidtab.find_all('table'):
        # The first two rows of every province table are column headers.
        for index, tr in enumerate(table.find_all('tr')[2:]):
            tds = tr.find_all('td')
            # The first data row leads with a province cell, so the city
            # name sits in the second cell there.
            city_td = tds[1] if index == 0 else tds[0]
            city = list(city_td.stripped_strings)[0]
            # Count columns from the end: offsets are stable there even
            # though the leading cells differ between rows.
            temp_high = list(tds[-5].stripped_strings)[0]
            temp_low = list(tds[-2].stripped_strings)[0]
            info.append((city, temp_high, temp_low))
    return info
import csv
def save_weather(info, path='weatherinfo.csv'):
    """Write (city, temp_high, temp_low) rows to a CSV file with a header.

    Args:
        info: iterable of (city, temp_high, temp_low) tuples, already in
            header-column order.
        path: output file; defaults to the original hard-coded name so
            existing callers are unaffected.
    """
    fieldnames = ['city', 'temp_high', 'temp_low']
    # newline='' stops the csv module doubling line endings on Windows.
    with open(path, 'w', encoding='UTF-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(fieldnames)  # header row
        # Tuples already match the header order, so no dict round-trip needed.
        writer.writerows(info)
def main():
    """Scrape every region page and save all rows to weatherinfo.csv."""
    regions = ['hb', 'db', 'hd', 'hz', 'hn', 'xb', 'xn', 'gat']
    all_rows = []
    for region in regions:
        page = get_source('http://www.weather.com.cn/textFC/' + region + '.shtml')
        all_rows.extend(get_info(page))
    save_weather(all_rows)
if __name__ == '__main__':
    main()
# Method 2: print city temperatures for selected region pages
import requests
from bs4 import BeautifulSoup
def parse_page(url, timeout=10):
    """Download one region page and print each city with its temperature.

    Args:
        url: www.weather.com.cn/textFC region page URL.
        timeout: seconds before the request is abandoned; the original
            call had none and could block forever.
    """
    response = requests.get(url, timeout=timeout)
    text = response.content.decode('utf-8')
    # html5lib handles the site's malformed table markup.
    soup = BeautifulSoup(text, 'html5lib')
    conMidtab = soup.find('div', class_='conMidtab')
    if conMidtab is None:
        # Error page or changed layout: nothing to print, avoid AttributeError.
        return
    for table in conMidtab.find_all('table'):
        # The first two rows of every province table are column headers.
        for index, tr in enumerate(table.find_all('tr')[2:]):
            tds = tr.find_all('td')
            # First data row leads with the province cell; city is the 2nd td.
            city_td = tds[1] if index == 0 else tds[0]
            city = list(city_td.stripped_strings)[0]
            # Second-to-last column holds the temperature value printed here.
            temp = list(tds[-2].stripped_strings)[0]
            print('城市:', city, '温度:', temp)
def main():
    """Print city/temperature listings for a sample of region pages."""
    # The original had two single-url assignments here that were dead code
    # (immediately shadowed by the list below); they have been removed.
    urls = [
        'http://www.weather.com.cn/textFC/hb.shtml',
        'http://www.weather.com.cn/textFC/db.shtml',
        'http://www.weather.com.cn/textFC/gat.shtml',
    ]
    for url in urls:
        parse_page(url)
if __name__ == '__main__':
    main()