1. An XPath that matches fine in the browser's debugger comes back empty when the page is scraped with Python.
The reason: the browser inserts a <tbody> element into every <table> as it builds the DOM, but that tag is absent from the raw source the server actually returns, which is all that requests and lxml ever see. An XPath copied from DevTools therefore carries a tbody step the real source does not have, so the query matches nothing; drop the tbody step (or bridge it with //) and it works.
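Here is a minimal sketch of the mismatch (the markup is made up for illustration): lxml parses the raw source, which has no tbody, so the DevTools-style path finds nothing while the tbody-free path matches.

from lxml import etree

# The HTML exactly as a server would send it: the <table> has no <tbody>.
raw = "<table><tr><td>first</td></tr><tr><td>second</td></tr></table>"
html = etree.HTML(raw)

# The path copied from DevTools includes the browser-inserted <tbody>: no match.
print(html.xpath("//table/tbody/tr/td/text()"))  # []

# Drop the tbody step (or bridge it with //) and the rows appear.
print(html.xpath("//table/tr/td/text()"))   # ['first', 'second']
print(html.xpath("//table//tr/td/text()"))  # ['first', 'second']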
import datetime
import time

import openpyxl
import requests
from lxml import etree
def get_url_html(url):
    """
    Request the page and return its parsed source; on any failure, return an empty value.
    url: the address you would open in an ordinary browser.
    """
    headers = {
        "Connection": "keep-alive",
        "Cookie": "JSESSIONID=E4674C29E2A76CB08BB651053D8C951E.bswa3n; wipo-visitor-uunid=ff51e08378c28600; "
                  "_gcl_au=1.1.661799052.1650246701; _ga=GA1.3.1709262595.1650246701; "
                  "_pk_ref.14.ec75=%5B%22%22%2C%22%22%2C1650442707%2C%22https%3A%2F%2Fwww3.wipo.int%2F%22%5D; "
                  "_hjSessionUser_787562=eyJpZCI6IjhhNGViODJkLTFiNTEtNWNmNC1iMDc0LTliNDRiZGJkYTlhZCIsImNyZWF0ZWQ"
                  "iOjE2NTA0NDI3MDc5NTAsImV4aXN0aW5nIjpmYWxzZX0=; "
                  "_pk_id.14.ec75=845d6b854d46c8ec.1650440759.2.1650442818.1650440759.; "
                  "_gid=GA1.3.807169207.1650806717; _gid=GA1.2.807169207.1650806717; "
                  "_ga=GA1.1.1709262595.1650246701; _pk_id.9.ec75=3222e84a40150571.1650246702.; "
                  "_pk_id.9.d630=a4ae4c09b954546d.1650246701.; "
                  "_pk_uid=0%3DczoxNjoiMzIyMmU4NGE0MDE1MDU3MSI7%3A_%3D4d811534abc282543fa0eeaad6da945e10b9c701; "
                  "_ga_15TSHJ0HWP=GS1.1.1651022240.33.0.1651022878.0",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cache-Control": "no-cache",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/101.0.4951.67 Safari/537.36",
    }
    html_str = ""
    try:
        # Fetch the page source (what "View page source" shows in a browser),
        # then parse the string into an HTML tree with lxml's etree.
        # Note: requests timeouts are in seconds.
        r = requests.get(url, timeout=10, headers=headers)
        # Status code 200 means the request succeeded
        if r.status_code == 200:
            # Decode with the page's detected encoding to avoid garbled text
            r.encoding = r.apparent_encoding
            html_str = etree.HTML(r.text)
        else:
            # Any other status code: wait a second and request the page again
            time.sleep(1)
            html_str = etree.HTML(requests.get(url, headers=headers).text)
    except Exception:
        print("Request failed:", url)
    return html_str
# Extract the data rows from a https://kaijiang.zhcw.com/ list page
def get_html_str(html_str):
    # data_list holds the rows before blank entries are stripped
    data_list = []
    try:
        # The row XPath copied from DevTools finds nothing, because the raw
        # source contains no <tbody>:
        # tr_list = html_str.xpath('//*[@id="content"]/div[2]/div[2]/div[2]/table/tbody/tr')
        # Query the actual source instead: skip the two header rows, then take
        # the twenty data rows that follow.
        option = html_str.xpath('//table/tr[position()>=3][position()<21]')
        for op in option:
            # Within each row, pick out the six columns one by one;
            # a missing cell simply becomes an empty string.
            try:
                col1 = str(op.xpath("./td[1]/text()")[0])
            except IndexError:
                col1 = ""
            try:
                col2 = str(op.xpath("./td[2]/text()")[0])
            except IndexError:
                col2 = ""
            # The winning numbers sit in several <em> tags; join them all
            col3 = " ".join(op.xpath("./td[3]/em/text()"))
            try:
                col4 = str(op.xpath("./td[4]/strong/text()")[0])
            except IndexError:
                col4 = ""
            try:
                col5 = str(op.xpath("./td[5]/strong/text()")[0])
            except IndexError:
                col5 = ""
            try:
                col6 = str(op.xpath("./td[6]/strong/text()")[0])
            except IndexError:
                col6 = ""
            data_list.append([col1.strip(), col2.strip(), col3.strip(),
                              col4.strip(), col5.strip(), col6.strip()])
    except Exception as e:
        print(e)
    return data_list
def write_excel(file_name, write_list):
    """
    Walk the collected rows and append them one by one to a new workbook.
    file_name: the file name to save the workbook under
    write_list: the list of rows to write, one inner list per spreadsheet row
    """
    full_excel = openpyxl.Workbook()
    full_sheet = full_excel.active
    for row in write_list:
        full_sheet.append(row)
    full_excel.save(file_name)
# Entry point
if __name__ == '__main__':
    start_time = datetime.datetime.now()
    # Each list URL is a fixed prefix followed by the page number
    now_date = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    # Start with the header row; the rows scraped from every page are
    # appended below it.
    every_page_result_list = [["Draw date", "Issue no.", "Winning numbers",
                               "Sales (CNY)", "First prize", "Second prize"]]
    # Loop over every list page and collect its data rows
    # pages = 78
    pages = 143
    for index in range(1, pages + 1):
        # Build the page URL
        url = "https://kaijiang.zhcw.com/zhcw/html/ssq/list_" + \
            str(index) + ".html"
        # Scrape the page and append its rows
        every_page_result_list += get_html_str(get_url_html(url))
        # Uncomment to throttle between pages if the site starts refusing requests
        # time.sleep(3)
    # openpyxl writes .xlsx workbooks, so save the result under that extension
    write_excel(r"F:\PythonProject" + now_date + ".xlsx", every_page_result_list)
    end_time = datetime.datetime.now()
    print(f"Total elapsed time: {(end_time - start_time).seconds} seconds")
That is the complete script.
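One subtlety in the row-selection XPath is worth spelling out: stacked predicates re-number positions, so tr[position()>=3][position()<21] keeps original rows 3 through 22, exactly the twenty data rows that follow the two header rows on each list page. A minimal sketch with made-up markup:

from lxml import etree

# Thirty numbered rows stand in for the real table.
html = etree.HTML("<table>" + "".join(
    f"<tr><td>{i}</td></tr>" for i in range(1, 31)) + "</table>")

# [position()>=3] keeps rows 3..30; those survivors are re-numbered 1..28,
# so [position()<21] then keeps the first twenty of them: original rows 3..22.
rows = html.xpath("//table/tr[position()>=3][position()<21]")
print(len(rows))                      # 20
print(rows[0].xpath("./td/text()"))   # ['3']
print(rows[-1].xpath("./td/text()"))  # ['22']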