爬虫抓取自己 CSDN 博客的点赞数

查看自己每篇文章的点赞数、浏览量等数据。

# Page through the blog's article list and collect every article link.
import math
import re

import requests

# The same browser-like User-Agent is sent with every request.
HEADERS = {'User-Agent': 'Opera/8.0 (Windows NT 5.1; U; en)'}
# Without a timeout requests.get() can block forever on a stalled connection.
TIMEOUT = 10  # seconds

url = 'https://me.csdn.net/yellow_python'
profile_html = requests.get(url, headers=HEADERS, timeout=TIMEOUT).text

# The article count is the number in the <span> immediately before the
# "原创" link on the profile page. Raw strings keep \d, \s and \? as regex
# escapes instead of (invalid) string escapes.
count_match = re.search(
    r'<span>(\d+)</span>\s+<a href="https://blog\.csdn\.net/yellow_python\?t=1" target="_blank"><strong>原创</strong></a>',
    profile_html,
)
if count_match is None:
    # Fail with a readable message instead of "'NoneType' has no attribute 'group'".
    raise RuntimeError(
        'article count not found on %s; the page layout may have changed' % url
    )
articles = count_match.group(1)

# 20 is the page size the original script divides by — presumably the number
# of articles CSDN shows per list page; verify if links go missing.
pages = math.ceil(int(articles) / 20)

# Compiled once, outside the loop; captures the absolute URL of each article.
article_link_re = re.compile(
    r'<h4 class="">\s+<a href="(https://blog\.csdn\.net/Yellow_python/article/details/\d+)" target="_blank">'
)
article_urls = []
for page in range(1, pages + 1):
    page_url = 'https://blog.csdn.net/Yellow_python/article/list/%d' % page
    page_html = requests.get(page_url, headers=HEADERS, timeout=TIMEOUT).text
    article_urls.extend(article_link_re.findall(page_html))
print(len(article_urls), article_urls)
# Parse every article page in a headless Firefox and print its statistics.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.firefox.options import Options  # Firefox settings

# CSS selectors for the three values read from each article page.
TITLE_SELECTOR = 'html body div#mainBox.container.clearfix main div.blog-content-box div.article-header-box div.article-header div.article-title-box h1.title-article'
APPROVAL_SELECTOR = '.long-height > p:nth-child(4)'
PV_SELECTOR = '.read-count'

firefox_option = Options()
# Run Firefox without a window; this is the flag set_headless() used to add.
firefox_option.add_argument('-headless')
driver = webdriver.Firefox(options=firefox_option)
try:
    wait = WebDriverWait(driver, 9)  # explicit wait, timeout in seconds

    def element_text(selector):
        """Return the text of the first element matching *selector*.

        Retries the lookup until the element exists or the wait times out,
        in which case WebDriverWait raises TimeoutException.
        """
        element = wait.until(lambda d: d.find_element(By.CSS_SELECTOR, selector))
        return element.text

    for article_url in article_urls:
        driver.get(article_url)
        title = element_text(TITLE_SELECTOR)
        approval = element_text(APPROVAL_SELECTOR)
        pv = element_text(PV_SELECTOR)
        print(approval, pv, title, article_url, sep=' | ')
finally:
    # quit() ends the whole browser session, not just the current window, and
    # runs even when a page fails, so no headless Firefox is left behind.
    driver.quit()

注意:如果程序运行中途出错,无头浏览器可能没有被正常关闭;程序运行后,要检查并清理这些残留的无头浏览器进程。

已标记关键词 清除标记
相关推荐
©️2020 CSDN 皮肤主题: 技术黑板 设计师:CSDN官方博客 返回首页
实付 99.90元
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、C币套餐、付费专栏及课程。

余额充值