爬取帖子|python爬虫|python入门|python教程

当前位置:

首页 > 编程开发 > python爬虫 >

python爬虫之爬取帖子

本站最新发布 Python从入门到精通|Python基础教程
试听地址 https://www.xin3721.com/eschool/pythonxin3721/

爬取百度贴吧帖子的标题、发布时间和链接

import threading
import requests
import re
import os

#   Baidu Tieba: scrape each thread's title, posting time and link

#   Name of the Tieba forum to search (sent as the `kw` query parameter)
word = '文字控吧'
#   Number of listing pages to scrape
num = 5


def parse(word, pn):
    """Fetch one listing page of a Tieba forum and extract its thread links.

    Args:
        word: Forum name, sent as the ``kw`` query parameter.
        pn: Paging offset, sent as the ``pn`` query parameter.

    Returns:
        A list of ``(path, title)`` tuples, one per anchor matched on the
        page, where ``path`` has the form ``/p/<digits>``. The list is empty
        when nothing on the page matches the pattern.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    print('正在请求中...')
    # A timeout keeps a stalled connection from blocking the script forever.
    response = requests.get(
        'https://tieba.baidu.com/f',
        params={'kw': word, 'pn': pn},
        timeout=10,
    )
    # Decode leniently so a single malformed byte does not abort the page.
    html = response.content.decode('utf-8', errors='replace')
    return re.findall(
        r'<a rel="noreferrer" href="(/p/\d+)" title="(.*?)" target=', html, re.S)


def parse_detail(article_urls):
    """Fetch each thread page and append its summary to ``<word>.txt``.

    For every entry the thread page is downloaded, the first ``userName``
    value and the timestamp following the "1楼" marker are extracted, and a
    record with author, title, posting time and link is printed and appended
    to a text file named after the module-level ``word``, in the current
    working directory.

    Args:
        article_urls: Iterable of sequences whose item 0 is the thread path
            (e.g. ``/p/123``) and item 1 is the thread title.

    Returns:
        None. Entries whose page cannot be fetched or parsed are skipped.
    """
    author_pattern = r'"userName":"(.*?)"'
    time_pattern = (r'<span class="tail-info">1楼</span>'
                    r'<span class="tail-info">(.*?)</span>')

    for article_url in article_urls:
        link = 'https://tieba.baidu.com' + article_url[0]
        title = article_url[1]

        try:
            page = requests.get(link, timeout=10).text
        except requests.RequestException as exc:
            # One bad request should not abort the remaining threads' worth
            # of entries handled by this worker.
            print('请求失败：{}（{}）'.format(link, exc))
            continue

        author_match = re.search(author_pattern, page, re.S)
        time_match = re.search(time_pattern, page, re.S)
        # Both fields are required; a missing timestamp used to raise
        # IndexError because only the author match was checked.
        if author_match is None or time_match is None:
            print('未匹配到数据，这个正则不符合这个贴吧，需要重写正则')
            continue

        author = author_match.group(1)
        create_time = time_match.group(1)
        if not (author and create_time):
            continue

        content = '楼主：{}\n标题：{}\n发布时间：{}\n链接：{}\n'.format(
            author, title, create_time, link)
        print(content)
        # Explicit UTF-8 so the Chinese text is written correctly regardless
        # of the platform's default locale encoding.
        with open(word + '.txt', 'a', encoding='utf-8') as f:
            f.write('{}\n'.format(content))


# Create the output folder if it is missing and work inside it, so the
# result file written by parse_detail lands there. makedirs(exist_ok=True)
# avoids the check-then-create race of a separate exists() test.
os.makedirs('百度贴吧', exist_ok=True)
os.chdir('百度贴吧')

# One worker thread per listing page: the listing itself is fetched here,
# sequentially, and the thread then requests every thread page found on it.
# The paging offset advances by 50 per listing page.
t_list = []
for pn in range(0, num * 50, 50):
    # Get the thread paths and titles for this listing page
    article_urls = parse(word, pn)
    # Request every thread page from a dedicated worker
    t = threading.Thread(target=parse_detail, args=(article_urls,))
    t_list.append(t)

# Start all workers
for t in t_list:
    t.start()
# Wait for all workers to finish
for t in t_list:
    t.join()

栏目列表

首页 > 编程开发 > python爬虫 >

python爬虫之爬取帖子