VB.net 2010 视频教程 VB.net 2010 视频教程 python基础视频教程
SQL Server 2008 视频教程 C#入门经典教程 Visual Basic从入门到精通视频教程
当前位置:
首页 > Python基础教程 >
  • 用python实现的抓取腾讯视频所有电影的爬虫

用python实现的抓取腾讯视频所有电影的爬虫(文章不错,所以进行了转载)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
# -*- coding: utf-8 -*-
import re
import string
import time
import urllib2

import pymongo
from bs4 import BeautifulSoup
 
# Module-level state shared by the scraping functions in this file.
NUM = 0         # global: running count of movies handled by getmovie()
m_type = u''    # global: category (genre) label stored with each movie
m_site = u'qq'  # global: identifier of the source site stored with each movie
 
def gethtml(url):
    """Download the page at *url* and return its raw body.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body exactly as read from the connection (not decoded).

    Raises:
        urllib2.URLError: Propagated from ``urllib2.urlopen`` when the
            request cannot be completed.
    """
    response = urllib2.urlopen(urllib2.Request(url))
    try:
        return response.read()
    finally:
        # The response object is not a context manager in Python 2, so close
        # it explicitly; otherwise every fetched page leaks a socket.
        response.close()
 
def gettags(html):
    """Extract the movie categories from the category list page.

    Only the first ``<ul class="clearfix _group" gname="mi_type">`` element
    is inspected; every category link inside it contributes one entry.

    Side effect: the module-level ``m_type`` is overwritten with each
    category title in turn, so after the call it holds the last title seen.

    Args:
        html: Markup of the category list page.

    Returns:
        A dict mapping category title to category URL, both decoded from
        UTF-8. The dict is empty when the category list or its links cannot
        be found (the original code raised in that situation).
    """
    global m_type

    tags_url = {}
    soup = BeautifulSoup(html)
    tags_all = soup.find_all('ul', {'class': 'clearfix _group', 'gname': 'mi_type'})
    if not tags_all:
        # No category list on the page: report it the same way as "no links"
        # instead of failing with IndexError on tags_all[0].
        print("Not Find")
        return tags_url

    # Sample link being matched:
    # <a _hot="tag.sub" class="_gtag _hotkey" href="http://v.qq.com/list/1_0_-1_-1_1_0_0_20_0_-1_0.html" title="NAME" tvalue="0">NAME</a>
    re_tags = r'<a _hot=\"tag\.sub\" class=\"_gtag _hotkey\" href=\"(.+?)\" title=\"(.+?)\" tvalue=\"(.+?)\">.+?</a>'
    tags = re.compile(re_tags, re.DOTALL).findall(str(tags_all[0]))
    if not tags:
        print("Not Find")
        return tags_url

    for tag in tags:
        # Each match is (href, title, tvalue); tvalue is not used.
        m_type = tag[1].decode('utf-8')
        tags_url[m_type] = tag[0].decode('utf-8')
    return tags_url
 
def get_pages(tag_url):
    """Return the number of listing pages for one category.

    Downloads the category page and reads the pager block
    ``<div class="mod_pagenav" id="pager">``.

    Args:
        tag_url: URL of the first listing page of the category.

    Returns:
        The label of the second-to-last pager link, as a string (presumably
        the last link is a "next page" control, so this is the highest page
        number -- TODO confirm against the live markup), or the integer ``1``
        when the pager is missing or has fewer than two links. Callers should
        pass the result through ``int()``.
    """
    tag_html = gethtml(tag_url)
    soup = BeautifulSoup(tag_html)
    div_page = soup.find_all('div', {'class': 'mod_pagenav', 'id': 'pager'})
    if not div_page:
        # No pager block at all: treat the category as a single page instead
        # of failing with IndexError on div_page[0].
        return 1

    # Sample link being matched:
    # <a class="c_txt6" href="http://v.qq.com/list/1_2_-1_-1_1_0_24_20_0_-1_0.html" title="25"><span>25</span></a>
    re_pages = r'<a class=.+?><span>(.+?)</span></a>'
    pages = re.compile(re_pages, re.DOTALL).findall(str(div_page[0]))
    if len(pages) > 1:
        return pages[-2]
    return 1
     
 
def getmovielist(html):
    """Hand every poster list found in a listing page to ``getmovie``.

    Args:
        html: Markup of one listing page.

    Each ``<ul class="mod_list_pic_130">`` element is serialized, stripped of
    newlines and passed to ``getmovie``. Nothing is returned.
    """
    soup = BeautifulSoup(html)
    for poster_list in soup.find_all('ul', {'class': 'mod_list_pic_130'}):
        # getmovie() matches with a regular expression, so flatten the
        # serialized element onto a single line first.
        getmovie(str(poster_list).replace('\n', ''))
 
 
def getmovie(html):
    """Store every movie found in one poster list in MongoDB.

    Each match of the poster-link pattern yields a (url, title) pair, which
    is saved together with the module-level ``m_site`` and ``m_type`` into
    the ``playlinks`` collection of the ``dianying`` database on
    ``localhost:27017``. The module-level counter ``NUM`` is increased by one
    per stored movie, and progress is printed.

    Args:
        html: Serialized poster list, expected on a single line.

    Returns:
        None. Nothing is stored and no connection is opened when *html*
        contains no movie entries.
    """
    global NUM

    re_movie = r'<li><a class=\"mod_poster_130\" href=\"(.+?)\" target=\"_blank\" title=\"(.+?)\"><img.+?</li>'
    movies = re.compile(re_movie, re.DOTALL).findall(html)
    if not movies:
        return

    # pymongo.Connection was removed in pymongo 3.0; MongoClient is its
    # replacement and takes the same host/port arguments.
    conn = pymongo.MongoClient('localhost', 27017)
    try:
        playlinks = conn.dianying.playlinks
        for movie in movies:
            # Count each movie exactly once (it was previously incremented
            # twice per movie, which doubled the reported total).
            NUM += 1
            print("%s : %d" % ("=" * 70, NUM))
            values = dict(
                movie_title=movie[1],
                movie_url=movie[0],
                movie_site=m_site,
                movie_type=m_type,
            )
            print(values)
            playlinks.insert(values)
            print("_" * 70)
    finally:
        # One connection is opened per call, so release it here rather than
        # leaking a client for every poster list processed.
        conn.close()
 
def getmovieinfo(url):
    """Collect the (href, title) pairs listed on a movie's album page.

    Downloads *url* and searches the first
    ``<div class="pack pack_album album_cover">`` element for album links.

    Args:
        url: Address of the movie page.

    Returns:
        A list of ``(href, title)`` tuples; empty when the album block or
        its links are not present, in which case a notice is printed.
    """
    html = gethtml(url)
    soup = BeautifulSoup(html)
    divs = soup.find_all('div', {'class': 'pack pack_album album_cover'})

    # Sample link being matched:
    # <a href="http://www.tudou.com/albumplay/9NyofXc_lHI/32JqhiKJykI.html" target="new" title="TITLE" wl="1"> </a>
    re_info = r'<a href=\"(.+?)\" target=\"new\" title=\"(.+?)\" wl=\".+?\"> </a>'

    m_info = []
    if divs:
        # Guarded so that a page without the album block yields an empty
        # result instead of IndexError on divs[0].
        m_info = re.compile(re_info, re.DOTALL).findall(str(divs[0]))
    if not m_info:
        print("Not find movie info")
    return m_info
 
 
def insertdb(movieinfo):
    """Insert *movieinfo* into the ``movies`` collection of ``dianying_at``.

    Uses the module-level ``conn`` MongoDB client. Nothing in this file
    assigns ``conn``, so it is created on first use against
    ``localhost:27017`` (the server ``getmovie`` also uses); a client that was
    assigned beforehand is used unchanged.

    Args:
        movieinfo: Document (or list of documents) to store.
    """
    global conn
    if 'conn' not in globals():
        # Without this, the first call fails with NameError because the
        # global is declared but never initialized.
        conn = pymongo.MongoClient('localhost', 27017)
    conn.dianying_at.movies.insert(movieinfo)
 
if __name__ == "__main__":
    # Script entry point: read the category list, then walk every listing
    # page of every category and store the movies found on it.
    tags_url = "http://v.qq.com/list/1_-1_-1_-1_1_0_0_20_0_-1_0.html"
    tag_urls = gettags(gethtml(tags_url))

    for category, category_url in tag_urls.items():
        # getmovie() stores the module-level m_type with each movie, so it
        # must name the category being crawled; otherwise every movie is
        # saved under whichever category gettags() happened to parse last.
        m_type = category

        list_url = category_url.encode('utf-8')
        print(list_url)
        maxpage = int(get_pages(list_url))
        print(maxpage)

        # Listing pages of one category differ only in the page index that
        # precedes "_20_0_-1_0.html", e.g.
        # http://v.qq.com/list/1_0_-1_-1_1_0_0_20_0_-1_0.html
        m_url = list_url.replace('0_20_0_-1_0.html', '')
        for x in range(maxpage):
            movie_url = "%s%d_20_0_-1_0.html" % (m_url, x)
            print(movie_url)
            getmovielist(gethtml(movie_url))
            # Short pause between page requests.
            time.sleep(0.1)
 

相关教程