使用python抓取双色球所有中奖记录(出现的问题)

1、在调试器中匹配的Xpath没有问题,但是用python爬取时却显示空白

原因是因为:浏览器调试器会自动补全tbody标签,而网页源代码中并没有tbody,所以XPath表达式中不能包含tbody

调试器中的代码
源代码中的代码
import datetime
import openpyxl
import requests
import time
from lxml import etree


def get_url_html(url):
    """
    Fetch *url* and return its parsed HTML tree.

    url: the page address to request, as typed into a normal browser.
    Returns an lxml HTML element on success, or "" (falsy) if the request
    fails — callers must tolerate the empty-string sentinel.
    """
    # NOTE(review): this Cookie was captured from a different site
    # (wipo.int) and is almost certainly stale/unnecessary for
    # kaijiang.zhcw.com — confirm whether it can be dropped.
    headers = {
        "connection": "keep-alive",
        "Cookie": "JSESSIONID=E4674C29E2A76CB08BB651053D8C951E.bswa3n; wipo-visitor-uunid=ff51e08378c28600; "
                      "_gcl_au=1.1.661799052.1650246701; _ga=GA1.3.1709262595.1650246701; "
                      "_pk_ref.14.ec75=%5B%22%22%2C%22%22%2C1650442707%2C%22https%3A%2F%2Fwww3.wipo.int%2F%22%5D; "
                      "_hjSessionUser_787562=eyJpZCI6IjhhNGViODJkLTFiNTEtNWNmNC1iMDc0LTliNDRiZGJkYTlhZCIsImNyZWF0ZWQ"
                      "iOjE2NTA0NDI3MDc5NTAsImV4aXN0aW5nIjpmYWxzZX0=; "
                      "_pk_id.14.ec75=845d6b854d46c8ec.1650440759.2.16504 "
                      "42818.1650440759.; _gid=GA1.3.807169207.1650806717; _gid=GA1.2.807169207.1650806717; "
                      "_ga=GA1.1.17092 "
                      "62595.1650246701; _pk_id.9.ec75=3222e84a40150571.1650246702.; "
                      "_pk_id.9.d630=a4ae4c09b954546d.1650246701 "
                      ".; _pk_uid=0%3DczoxNjoiMzIyMmU4NGE0MDE1MDU3MSI7%3A_%3D4d811534abc282543fa0eeaad6da945e10b9c701"
                      "; _ga_15TSHJ0H "
                      "WP=GS1.1.1651022240.33.0.1651022878.0",

        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cache-Control": "no-cache",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36",
    }

    html_str = ""
    try:
        # One request only — the original fetched the page twice (once for
        # the status check, again for the body).  timeout is in seconds;
        # the original's timeout=2000 (~33 minutes) was surely a typo.
        response = requests.get(url, timeout=10, headers=headers)
        if response.status_code != 200:
            # One polite retry after a short pause on a non-200 answer.
            time.sleep(1)
            response = requests.get(url, timeout=10, headers=headers)
        # Adopt the page's detected encoding so Chinese text is not garbled;
        # the original's retry path skipped this step.
        response.encoding = response.apparent_encoding
        html_str = etree.HTML(response.text)
    except requests.RequestException:
        # Narrowed from a bare `except:` — only network/HTTP failures
        # should be swallowed here.
        print("请求失败")
    return html_str


# 获取https://kaijiang.zhcw.com/网页上的数据
def get_html_str(html_str):
    # data_list中的数据为未删除空白行的数据
    data_list = []
    try:
        # 查找网页中每一行的数据
        # tr_list = html_str.xpath('//*[@id="content"]/div[2]/div[2]/div[2]/table/tbody/tr')
        # 根据网页中的源码,确认表格所在的div等数据
        option = html_str.xpath('//table/tr[position()>=3][position()<21]')
        # print(option)
        for op in option:
            # 根据每一行,匹配出其他6列的数据
            try:
                col1 = str(op.xpath("./td[1]/text()")[0])
            except:
                col1 = ""
            try:
                col2 = str(op.xpath("./td[2]/text()")[0])
            except:
                col2 = ""
            try:
                col3 = str(op.xpath("./td[3]/em/text()"))
            except:
                col3 = ""
            try:
                col4 = str(op.xpath("./td[4]/strong/text()")[0])
            except:
                col4 = ""
            try:
                col5 = str(op.xpath("./td[5]/strong/text()")[0])
            except:
                col5 = ""
            try:
                col6 = str(op.xpath("./td[6]/strong/text()")[0])
            except:
                col6 = ""
            data_list.append([col1.strip(), col2.strip(), col3.strip(
            ), col4.strip(), col5.strip(), col6.strip()])
    except Exception as e:
        print(e, url)
    return data_list


def write_excel(file_name, write_list):
    """
    Write every row of *write_list* into a fresh workbook.

    file_name: path the workbook is saved to.
    write_list: list of rows (each row an iterable of cell values),
                appended to the active sheet in order.
    """
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    for row in write_list:
        sheet.append(row)
    workbook.save(file_name)


# 主函数
# Script entry point: scrape every result page and dump them to one workbook.
if __name__ == '__main__':
    start_time = datetime.datetime.now()

    # The listing URL pattern is a fixed prefix + page number + ".html".
    # Timestamp the output file so reruns never overwrite each other.
    now_date = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    # Header row first; every scraped page's rows get appended after it.
    every_page_result_list = [["开奖日期", "期号", "中奖号码",
                               "销售额(元)", "一等奖", "二等奖"]]
    # Total number of listing pages on the site at the time of writing.
    pages = 143
    for index in range(1, pages + 1):
        url = ("https://kaijiang.zhcw.com/zhcw/html/ssq/list_"
               + str(index) + ".html")
        # Fetch and parse one page, then append its rows.
        every_page_result_list = every_page_result_list + \
            get_html_str(get_url_html(url))
        # Brief pause between pages to avoid hammering the server.
        time.sleep(1)
    # openpyxl writes xlsx-format bytes, so the file must carry an .xlsx
    # extension — the original saved it as ".csv", which Excel rejects.
    write_excel(r"F:\PythonProject" + now_date + ".xlsx",
                every_page_result_list)
    end_time = datetime.datetime.now()
    print(f"耗时总共{(end_time - start_time).seconds}秒")

整个代码

暂无评论

发送评论 编辑评论


				
|´・ω・)ノ
ヾ(≧∇≦*)ゝ
(☆ω☆)
(╯‵□′)╯︵┴─┴
 ̄﹃ ̄
(/ω\)
∠( ᐛ 」∠)_
(๑•̀ㅁ•́ฅ)
→_→
୧(๑•̀⌄•́๑)૭
٩(ˊᗜˋ*)و
(ノ°ο°)ノ
(´இ皿இ`)
⌇●﹏●⌇
(ฅ´ω`ฅ)
(╯°A°)╯︵○○○
φ( ̄∇ ̄o)
ヾ(´・ ・`。)ノ"
( ง ᵒ̌皿ᵒ̌)ง⁼³₌₃
(ó﹏ò。)
Σ(っ °Д °;)っ
( ,,´・ω・)ノ"(´っω・`。)
╮(╯▽╰)╭
o(*////▽////*)q
>﹏<
( ๑´•ω•) "(ㆆᴗㆆ)
😂
😀
😅
😊
🙂
🙃
😌
😍
😘
😜
😝
😏
😒
🙄
😳
😡
😔
😫
😱
😭
💩
👻
🙌
🖕
👍
👫
👬
👭
🌚
🌝
🙈
💊
😶
🙏
🍦
🍉
😣
Source: github.com/k4yt3x/flowerhd
颜文字
Emoji
小恐龙
花!
上一篇
下一篇