使用python抓取双色球所有中奖记录(出现的问题)

1、在调试器中匹配的Xpath没有问题,但是用python爬取时却显示空白

原因是因为:浏览器调试器会自动补全tbody标签,而网页源代码中并没有tbody,所以XPath表达式中不能包含tbody

调试器中的代码
源代码中的代码
import datetime
import openpyxl
import requests
import time
from lxml import etree


def get_url_html(url):
    """
    Fetch *url* and return its parsed HTML tree.

    url: the page address to request, as typed into a normal browser.
    Returns an lxml HTML element on success, or "" (falsy) if the request
    fails — callers must tolerate the empty-string sentinel.
    """
    # NOTE(review): this Cookie was captured from a different site
    # (wipo.int) and is almost certainly stale/unnecessary for
    # kaijiang.zhcw.com — confirm whether it can be dropped.
    headers = {
        "connection": "keep-alive",
        "Cookie": "JSESSIONID=E4674C29E2A76CB08BB651053D8C951E.bswa3n; wipo-visitor-uunid=ff51e08378c28600; "
                      "_gcl_au=1.1.661799052.1650246701; _ga=GA1.3.1709262595.1650246701; "
                      "_pk_ref.14.ec75=%5B%22%22%2C%22%22%2C1650442707%2C%22https%3A%2F%2Fwww3.wipo.int%2F%22%5D; "
                      "_hjSessionUser_787562=eyJpZCI6IjhhNGViODJkLTFiNTEtNWNmNC1iMDc0LTliNDRiZGJkYTlhZCIsImNyZWF0ZWQ"
                      "iOjE2NTA0NDI3MDc5NTAsImV4aXN0aW5nIjpmYWxzZX0=; "
                      "_pk_id.14.ec75=845d6b854d46c8ec.1650440759.2.16504 "
                      "42818.1650440759.; _gid=GA1.3.807169207.1650806717; _gid=GA1.2.807169207.1650806717; "
                      "_ga=GA1.1.17092 "
                      "62595.1650246701; _pk_id.9.ec75=3222e84a40150571.1650246702.; "
                      "_pk_id.9.d630=a4ae4c09b954546d.1650246701 "
                      ".; _pk_uid=0%3DczoxNjoiMzIyMmU4NGE0MDE1MDU3MSI7%3A_%3D4d811534abc282543fa0eeaad6da945e10b9c701"
                      "; _ga_15TSHJ0H "
                      "WP=GS1.1.1651022240.33.0.1651022878.0",

        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cache-Control": "no-cache",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36",
    }

    html_str = ""
    try:
        # One request only — the original fetched the page twice (once for
        # the status check, again for the body).  timeout is in seconds;
        # the original's timeout=2000 (~33 minutes) was surely a typo.
        response = requests.get(url, timeout=10, headers=headers)
        if response.status_code != 200:
            # One polite retry after a short pause on a non-200 answer.
            time.sleep(1)
            response = requests.get(url, timeout=10, headers=headers)
        # Adopt the page's detected encoding so Chinese text is not garbled;
        # the original's retry path skipped this step.
        response.encoding = response.apparent_encoding
        html_str = etree.HTML(response.text)
    except requests.RequestException:
        # Narrowed from a bare `except:` — only network/HTTP failures
        # should be swallowed here.
        print("请求失败")
    return html_str


# 获取https://kaijiang.zhcw.com/网页上的数据
def get_html_str(html_str):
    # data_list中的数据为未删除空白行的数据
    data_list = []
    try:
        # 查找网页中每一行的数据
        # tr_list = html_str.xpath('//*[@id="content"]/div[2]/div[2]/div[2]/table/tbody/tr')
        # 根据网页中的源码,确认表格所在的div等数据
        option = html_str.xpath('//table/tr[position()>=3][position()<21]')
        # print(option)
        for op in option:
            # 根据每一行,匹配出其他6列的数据
            try:
                col1 = str(op.xpath("./td[1]/text()")[0])
            except:
                col1 = ""
            try:
                col2 = str(op.xpath("./td[2]/text()")[0])
            except:
                col2 = ""
            try:
                col3 = str(op.xpath("./td[3]/em/text()"))
            except:
                col3 = ""
            try:
                col4 = str(op.xpath("./td[4]/strong/text()")[0])
            except:
                col4 = ""
            try:
                col5 = str(op.xpath("./td[5]/strong/text()")[0])
            except:
                col5 = ""
            try:
                col6 = str(op.xpath("./td[6]/strong/text()")[0])
            except:
                col6 = ""
            data_list.append([col1.strip(), col2.strip(), col3.strip(
            ), col4.strip(), col5.strip(), col6.strip()])
    except Exception as e:
        print(e, url)
    return data_list


def write_excel(file_name, write_list):
    """
    Write every row of *write_list* into a fresh workbook.

    file_name: path the workbook is saved to.
    write_list: list of rows (each row an iterable of cell values),
                appended to the active sheet in order.
    """
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    for row in write_list:
        sheet.append(row)
    workbook.save(file_name)


# 主函数
# Script entry point: scrape every result page and dump them to one workbook.
if __name__ == '__main__':
    start_time = datetime.datetime.now()

    # The listing URL pattern is a fixed prefix + page number + ".html".
    # Timestamp the output file so reruns never overwrite each other.
    now_date = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    # Header row first; every scraped page's rows get appended after it.
    every_page_result_list = [["开奖日期", "期号", "中奖号码",
                               "销售额(元)", "一等奖", "二等奖"]]
    # Total number of listing pages on the site at the time of writing.
    pages = 143
    for index in range(1, pages + 1):
        url = ("https://kaijiang.zhcw.com/zhcw/html/ssq/list_"
               + str(index) + ".html")
        # Fetch and parse one page, then append its rows.
        every_page_result_list = every_page_result_list + \
            get_html_str(get_url_html(url))
        # Brief pause between pages to avoid hammering the server.
        time.sleep(1)
    # openpyxl writes xlsx-format bytes, so the file must carry an .xlsx
    # extension — the original saved it as ".csv", which Excel rejects.
    write_excel(r"F:\PythonProject" + now_date + ".xlsx",
                every_page_result_list)
    end_time = datetime.datetime.now()
    print(f"耗时总共{(end_time - start_time).seconds}秒")

整个代码

暂无评论

发送评论 编辑评论


				
|´・ω・)ノ
ヾ(≧∇≦*)ゝ
(☆ω☆)
(╯‵□′)╯︵┴─┴
 ̄﹃ ̄
(/ω\)
∠( ᐛ 」∠)_
(๑•̀ㅁ•́ฅ)
→_→
୧(๑•̀⌄•́๑)૭
٩(ˊᗜˋ*)و
(ノ°ο°)ノ
(´இ皿இ`)
⌇●﹏●⌇
(ฅ´ω`ฅ)
(╯°A°)╯︵○○○
φ( ̄∇ ̄o)
ヾ(´・ ・`。)ノ"
( ง ᵒ̌皿ᵒ̌)ง⁼³₌₃
(ó﹏ò。)
Σ(っ °Д °;)っ
( ,,´・ω・)ノ"(´っω・`。)
╮(╯▽╰)╭
o(*////▽////*)q
>﹏<
( ๑´•ω•) "(ㆆᴗㆆ)
😂
😀
😅
😊
🙂
🙃
😌
😍
😘
😜
😝
😏
😒
🙄
😳
😡
😔
😫
😱
😭
💩
👻
🙌
🖕
👍
👫
👬
👭
🌚
🌝
🙈
💊
😶
🙏
🍦
🍉
😣
Source: github.com/k4yt3x/flowerhd
颜文字
Emoji
小恐龙
花!
上一篇
下一篇