本文实例讲述了Python爬虫实现的根据分类爬取豆瓣电影信息功能。分享给大家供大家参考,具体如下:
代码的入口:
if __name__ == '__main__': main()
#! /usr/bin/python3
# -*- coding:utf-8 -*-
# author:Sirius.Zhao
"""Crawl Douban movie listings by category and store them in MySQL.

Workflow:
  1. Scrape the category list from https://movie.douban.com/chart
  2. For each category, walk every rating-percentile bracket
     (100:90 ... 10:0) through Douban's JSON chart API, accumulating
     records in a module-level dict keyed by title+release_date so a
     film appearing under several categories is kept only once.
  3. Insert the de-duplicated records into the `douban`.`movies` table.
"""
import json
from urllib.parse import quote
from urllib.request import urlopen
from urllib.request import Request
import pymysql
import requests
from bs4 import BeautifulSoup
import sys
import datetime
import time
from imp import reload
import random


def LoadUserAgents(uafile):
    """Return a shuffled list of user-agent byte strings read from *uafile*.

    uafile : string
        Path to a text file of user agents, one per line.

    NOTE(review): the slice [1:-2] drops the first and last two characters
    of each stripped line -- this assumes each line is wrapped in quotes
    and ends with a comma (e.g. "Mozilla/...",). Confirm the file format;
    a plain unquoted list would be silently truncated.
    """
    uas = []
    with open(uafile, 'rb') as uaf:
        for ua in uaf.readlines():
            if ua:
                uas.append(ua.strip()[1:-1 - 1])
    random.shuffle(uas)
    return uas


# Pool of user agents; one is picked at random per category crawl.
uas = LoadUserAgents("user_agents.txt")

# All movies collected so far, keyed by title+release_date for de-duplication.
dict_movies = {}


def datetime_to_timestamp_in_milliseconds(d):
    """Return the current epoch time in milliseconds.

    NOTE(review): the *d* argument is ignored -- the function always
    returns "now", regardless of the datetime passed in. Kept as-is
    because it is part of the original interface (currently unused).
    """
    def current_milli_time():
        return int(round(time.time() * 1000))

    return current_milli_time()


# (removed) reload(sys) -- a Python 2 sys.setdefaultencoding relic; it has
# no effect on Python 3 and was dead code.

# Category list page:
#   https://movie.douban.com/chart
# Movies for a category and percentile bracket:
#   https://movie.douban.com/typerank?type_name=%E5%96%9C%E5%89%A7&type=24&interval_id=100:90

# Rating-percentile brackets the JSON API is queried with, best first.
percent_list = ['100:90', '90:80', '80:70', '70:60', '60:50',
                '50:40', '40:30', '30:20', '20:10', '10:0']


def find_iterm(url):
    """Scrape the chart page and return [category_names, category_hrefs]."""
    # Close the HTTP response promptly (the original leaked the socket).
    with urlopen(url) as response:
        bs = BeautifulSoup(response, 'html.parser')
    iterms = bs.select('div.types span a')
    iterms_href = [iterm.get('href') for iterm in iterms]
    iterms_list = [iterm.text for iterm in iterms]
    lists = [iterms_list, iterms_href]
    return lists


def find_total_num(suffix, head):
    """Return the total movie count for one category+bracket.

    suffix : e.g. '24&interval_id=100:90' (type id plus bracket).
    head   : request headers (User-Agent / Cookie / Referer).

    The endpoint answers JSON like
    {"playable_count":232,"total":431,"unwatched_count":431}.
    """
    link_total = "https://movie.douban.com/j/chart/top_list_count?type=" + suffix
    print(link_total)
    req = Request(link_total, headers=head)
    with urlopen(req) as total_data:
        total_num = json.load(total_data)['total']
    return total_num


def insert_into_mysql(dict_movies):
    """Insert every record of *dict_movies* into douban.movies.

    Uses a parameterized INSERT; commits once after all rows, and always
    closes the connection (the original leaked it on any exception).
    """
    con = pymysql.connect(host="localhost", user="root", password="root",
                          database="douban", charset='utf8', port=3306)
    try:
        with con.cursor() as cursor:
            print(dict_movies)
            sql_insert = "INSERT INTO `douban`.`movies` (`rating`, `title`, `release_date`, `regions`, `types`, `actors`, `vote_count`, `score`, `rank`) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)"
            for key, record in dict_movies.items():
                cursor.execute(sql_insert, record)
        con.commit()
    finally:
        con.close()


def find_allMovie_iterm(category, href):
    """Fetch every movie of one category across all percentile brackets.

    category : category display name (informational only).
    href     : category link with the trailing bracket already stripped,
               e.g. '/typerank?type_name=%E5%96%9C%E5%89%A7&type=24&interval_id='.

    Each movie dict from the API looks like:
      {"rating":["8.2","45"], "rank":391, "types":[...], "regions":[...],
       "title":"...", "release_date":"1932-10-21", "vote_count":1868,
       "score":"8.2", "actors":[...]}
    Results are accumulated into the module-level dict_movies keyed by
    title+release_date, de-duplicating films shared between categories.
    """
    ua = random.choice(uas)
    head = {
        'User-Agent': ua,
        'Cookie': 'bid=mMrd75oQWFA; __utmc=30149280; __utmc=223695111; __yadk_uid=TsnvvnzAl9l5hXsJExLg5PkZQD8tW2xu; ll="108288"; _vwo_uuid_v2=DA5ED1377260F937BEC8CBD3785E44E53|98ebf520a520de4c9c6b9bed6d211cd7; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1522309082%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DR23_MHR8K3SFj2J4gH-0n2G67VhfRtaG8GFHstysqjnPZ_HxqpDmGX54pQSSCCCd%26wd%3D%26eqid%3Dde9da0fa00002a7f000000035abc9802%22%5D; _pk_ses.100001.4cf6=*; __utma=30149280.65574578.1521358273.1522244587.1522309083.7; __utmz=30149280.1522309083.7.7.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utma=223695111.505210566.1522198584.1522244587.1522309083.3; __utmb=223695111.0.10.1522309083; __utmz=223695111.1522309083.3.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmt=1; __utmb=30149280.1.10.1522309083; _pk_id.100001.4cf6=c6e6b98e6f177261.1522198584.3.1522309214.1522248302.',
        'Referer': 'https://movie.douban.com/chart'
    }
    # href is '.../typerank?type_name=xx&type=24&interval_id='; everything
    # after '&type=' ('24&interval_id=') feeds the JSON endpoints directly.
    suffix = href.split('&type=')[1]
    link_movies = "https://movie.douban.com/j/chart/top_list?type=" + suffix
    for stage in percent_list:
        time.sleep(2)  # throttle: be polite to the server
        suffix_total = suffix + stage
        # Ask for the bracket's total first, then request all of it at once.
        total = find_total_num(suffix_total, head)
        url_movies = link_movies + stage + '&start=0&limit=' + str(total)
        req = Request(url_movies, headers=head)
        # The response body is a JSON array of movie objects.
        with urlopen(req) as movies_data:
            movies_list = json.load(movies_data)
        print(movies_list)
        for movie in movies_list:
            # rating[1] appears to be a 0-50 star figure; /10 -> 0-5 scale
            # (TODO confirm against the API).
            rating = str(int(movie['rating'][1]) / 10)
            title = movie['title']
            release_date = movie['release_date']
            regions = ','.join(movie['regions'])
            types = ','.join(movie['types'])
            actors = ','.join(movie['actors'])
            vote_count = movie['vote_count']
            score = movie['score']
            rank = movie['rank']
            dict_movies[title + release_date] = [rating, title, release_date,
                                                 regions, types, actors,
                                                 vote_count, score, rank]


def main():
    """Crawl every category, then persist the de-duplicated result set."""
    url = 'https://movie.douban.com/chart'
    # [names, hrefs] of all categories
    iterms = find_iterm(url)
    for i in range(len(iterms[1])):
        # Strip the '100:90' tail from the href; the bracket is re-appended
        # per request inside find_allMovie_iterm.
        find_allMovie_iterm(iterms[0][i], iterms[1][i].split('100:90')[0])
    # BUGFIX: insert exactly once, after all categories are crawled. The
    # original called insert_into_mysql(dict_movies) inside the loop, and
    # since dict_movies is cumulative and never cleared, every previously
    # collected record was re-inserted once per category (duplicate rows).
    insert_into_mysql(dict_movies)


if __name__ == '__main__':
    main()