在 GitHub 上找了个项目, 但是太老了用不了; 搜了一下也没有找到其他可用的, 现在有什么可用的办法吗?
1
wdssmq 2021-07-21 09:40:19 +08:00
所以导出到哪里?需要全文抓取么?
|
2
woshichuanqilz OP 自己写了一个脚本, 代码里的 60 是最大页数, 请酌情修改:
"""Export the entries of a Zhihu collection to an Excel file using Selenium.

Each collection page is opened in Chrome, every entry is expanded via its
"阅读全文" button when one is present, and the title plus the content HTML
are written to ``res.xlsx``.
"""
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.common.by import By
import datetime
import pandas as pd
import time

# Placeholder: replace with the id that appears in the collection URL
# (https://www.zhihu.com/collection/<id>).
COLLECTION_ID = "收藏夹编号"
# Exclusive upper bound of the page loop (pages 1 .. MAX_PAGE - 1 are visited).
# Adjust the 60 to the number of pages in the collection.
MAX_PAGE = 60
# Seconds to wait after opening a page before looking for entries.
PAGE_LOAD_WAIT = 3
OUTPUT_FILE = 'res.xlsx'

# Two positional fields: collection id, then page number.
PAGE_URL = "https://www.zhihu.com/collection/{}?page={}"

XPATH_ITEMS = "//*[@class='CollectionDetailPageItem-innerContainer']"
XPATH_TITLE = ".//h2"
XPATH_READ_ALL = ".//button[text()='阅读全文']"
XPATH_CONTENT = ".//div[@class='RichContent-inner']"


def scrape_item(item):
    """Return ``{'title': ..., 'content': ...}`` for one collection entry.

    ``item`` is the WebElement of a single entry container. Raises a
    ``WebDriverException`` subclass when the title or content element cannot
    be located or the expand button cannot be clicked.
    """
    title = item.find_element(By.XPATH, XPATH_TITLE).text
    try:
        item.find_element(By.XPATH, XPATH_READ_ALL).click()
    except NoSuchElementException:
        # No expand button on this entry; presumably the content is already
        # shown in full, so keep whatever is rendered instead of skipping it.
        pass
    content = item.find_element(By.XPATH, XPATH_CONTENT).get_attribute('outerHTML')
    return {'title': title, 'content': content}


def save_results(rows):
    """Write the collected rows to OUTPUT_FILE, replacing any previous file."""
    pd.DataFrame(rows).to_excel(OUTPUT_FILE, index=False)


if COLLECTION_ID == "收藏夹编号":
    # Fail early with a readable message instead of requesting a bogus URL.
    raise SystemExit("Set COLLECTION_ID to your collection id before running.")

options = webdriver.ChromeOptions()
options.add_argument('--ignore-ssl-errors=yes')
options.add_argument('--ignore-certificate-errors')
options.add_argument("--disable-blink-features=AutomationControlled")
driver = webdriver.Chrome(options=options)

result_list = []
try:
    for page in range(1, MAX_PAGE):
        driver.get(PAGE_URL.format(COLLECTION_ID, page))
        time.sleep(PAGE_LOAD_WAIT)
        for item in driver.find_elements(By.XPATH, XPATH_ITEMS):
            try:
                result_list.append(scrape_item(item))
            except WebDriverException as exc:
                # Best effort: report the entry that failed and keep going.
                print("skipped an entry on page {}: {}".format(page, exc))
        # Checkpoint after every page so an interrupted run keeps its data.
        save_results(result_list)
    # Final save; also covers the case where the page loop did not run.
    save_results(result_list)
    input("please wait...")
finally:
    # Always close the browser, even when the run is interrupted.
    driver.quit()