摘要:还在为找不到好音乐而烦恼吗?今天利用Scrapy来抓取网易云音乐评论数大于10w的好歌曲,让你的音乐列表鼓起来!
首先创建Scrapy项目(创建步骤在之前的文章中已经介绍过,这里不再赘述),然后在settings.py中配置MongoDB的连接信息。除了歌曲之外,我在这里把所有的歌手信息也存入了MongoDB中:
# MongoDB connection settings read by the item pipeline.
MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
# Name of the database that holds both collections below.
MONGODB_DBNAME = 'music163'
# Collection used for singer documents ("geshou" is pinyin for "singer").
MONGODB_DOCNAME_GESHOU = 'singer'
# Collection used for song documents.
MONGODB_DOCNAME_MUSICS = 'musics'
items.py中需要定义两个Item类,分别用来存放歌曲信息和歌手信息:
import scrapy


class Music163Item(scrapy.Item):
    """Fields collected for a single song."""

    # Not assigned by the spider; presumably left for MongoDB to fill in
    # on insert -- TODO confirm.
    _id = scrapy.Field()
    # Song title.
    name = scrapy.Field()
    # Link to the song page, built from the song id.
    url = scrapy.Field()
    # Link found in the song page's "tit" block (presumably the music
    # video); only set when that link exists.
    movie = scrapy.Field()
    # Name of the first listed singer.
    singer = scrapy.Field()
    # Album title.
    album = scrapy.Field()
    # Album link exactly as it appears in the page's href attribute.
    album_url = scrapy.Field()
    # Comment count read from the song page.
    comments = scrapy.Field()


class Music163SingerItem(scrapy.Item):
    """Fields collected for a single singer."""

    # Not assigned by the spider; presumably left for MongoDB to fill in
    # on insert -- TODO confirm.
    _id = scrapy.Field()
    # Singer name.
    singer = scrapy.Field()
    # Source URL of the singer's image; missing for entries without one.
    headimg = scrapy.Field()
    # Absolute URL of the singer's page.
    info_url = scrapy.Field()
pipelines.py中需要根据传入的item类型来判断存入哪个集合(collection):
import logging

import pymongo
from pymongo.errors import PyMongoError
from scrapy.utils.project import get_project_settings

from .items import Music163Item
from .items import Music163SingerItem

logger = logging.getLogger(__name__)


class Music163Pipeline(object):
    """Store scraped songs and singers in their own MongoDB collections.

    Songs (``Music163Item``) go to the collection named by
    ``MONGODB_DOCNAME_MUSICS``; singers (``Music163SingerItem``) go to the
    one named by ``MONGODB_DOCNAME_GESHOU``.  Each collection has its own
    handle, so processing a singer never redirects later song inserts.
    """

    def __init__(self, settings=None):
        """Open the MongoDB connection.

        Args:
            settings: Mapping-like object providing the ``MONGODB_*`` keys.
                When omitted, the project settings are loaded, so the
                pipeline can still be constructed without arguments.
        """
        # ``scrapy.conf.settings`` is not importable in current Scrapy
        # releases; settings are taken from the crawler (see from_crawler)
        # or, as a fallback, from the project settings loader.
        if settings is None:
            settings = get_project_settings()
        self.client = pymongo.MongoClient(
            host=settings['MONGODB_HOST'],
            port=settings['MONGODB_PORT'],
        )
        self.tdb = self.client[settings['MONGODB_DBNAME']]
        # ``post`` always refers to the song collection.
        self.post = self.tdb[settings['MONGODB_DOCNAME_MUSICS']]
        self.singer_post = self.tdb[settings['MONGODB_DOCNAME_GESHOU']]

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the running crawler's settings."""
        return cls(crawler.settings)

    def close_spider(self, spider):
        """Release the MongoDB connection when the spider finishes."""
        self.client.close()

    def process_item(self, item, spider):
        """Insert ``item`` into the collection matching its type.

        Storage is best-effort: a database error is logged and the item is
        still returned so that later pipelines keep running.

        Args:
            item: The scraped item.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, unchanged.
        """
        if isinstance(item, Music163Item):
            self._store(self.post, item, 'Music Successful!')
        elif isinstance(item, Music163SingerItem):
            self._store(self.singer_post, item, 'Singer Successful!')
        return item

    @staticmethod
    def _store(collection, item, success_message):
        """Insert one item into ``collection``; print on success, log on failure."""
        try:
            collection.insert_one(dict(item))
        except PyMongoError:
            logger.exception('Could not store item in %s', collection.name)
            return
        print(success_message)
爬虫主文件main.py:
import scrapy
import requests
from scrapy import Selector

from ..items import Music163Item
from ..items import Music163SingerItem


class MusicSpider(scrapy.Spider):
    """Crawl artist listings on music.163.com and emit heavily commented songs.

    Flow: artist category pages -> each artist's page -> each song page.
    A ``Music163Item`` is yielded for every song whose comment count is
    strictly greater than ``min_comments``.
    """

    name = 'musicspider'
    # Must be bare domain names (no scheme) under the plural attribute name,
    # otherwise Scrapy does not apply the restriction at all.
    allowed_domains = ['music.163.com']
    # NOTE: a URL template, not a list; start_requests() fills in the
    # ``gid`` and ``initial`` placeholders.
    start_urls = 'http://music.163.com/discover/artist/cat?id={gid}&initial={initial}'
    # Category ids sent as the ``id`` query parameter of the listing page.
    group_ids = (1001, 1002, 1003, 2001, 2002, 2003, 6001, 6002, 6003,
                 7001, 7002, 7003, 4001, 4002, 4003)
    referer = 'http://music.163.com'
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36'
    headers = {'User-Agent': user_agent, 'Referer': referer}
    # A song is kept only when its comment count exceeds this value.
    min_comments = 100000
    # When True, parse() also yields a Music163SingerItem per artist.
    # Off by default, so only song items reach the pipelines.
    emit_singer_items = False

    def start_requests(self):
        """Yield one listing request per (category id, initial letter) pair."""
        for gid in self.group_ids:
            # ``initial`` is the character code of the letters A..Z (65..90).
            for initial in range(ord('A'), ord('Z') + 1):
                url = self.start_urls.format(gid=gid, initial=initial)
                yield self._request(url, self.parse)

    def parse(self, response):
        """Follow every artist linked from a listing page.

        Entries come in two layouts: the link sits either under ``p/a[1]``
        (next to an image) or directly under ``a[1]``.  Entries without a
        usable link are skipped instead of aborting the whole page.
        """
        for entry in response.xpath('//*[@id="m-artist-box"]/li'):
            link = entry.xpath('p/a[1]')
            if not link:
                link = entry.xpath('a[1]')
            href = link.xpath('@href').extract_first()
            if not href:
                self.logger.debug('Artist entry without link on %s', response.url)
                continue
            # The href may carry surrounding whitespace in the page markup.
            info_url = 'http://music.163.com' + href.strip()

            if self.emit_singer_items:
                item = Music163SingerItem()
                item['singer'] = link.xpath('text()').extract_first()
                item['info_url'] = info_url
                headimg = entry.xpath('div/img/@src').extract_first()
                if headimg is not None:
                    item['headimg'] = headimg
                yield item

            yield self._request(info_url, self.singer_parse)

    def singer_parse(self, response):
        """Follow every song linked from an artist page."""
        for entry in response.xpath('//ul[@class="f-hide"]/li'):
            href = entry.xpath('a/@href').extract_first()
            if not href:
                continue
            yield self._request('http://music.163.com' + href, self.music_parse)

    def music_parse(self, response):
        """Yield a ``Music163Item`` when the song has enough comments.

        The comment count is converted to ``int`` before the comparison and
        stored as an ``int``.  Pages whose count cannot be read, or that
        lack any of the required fields, are skipped with a log message.
        """
        raw_count = response.xpath(
            '//span[@class="sub s-fc3"]/span/text()').extract_first()
        comments = self._parse_count(raw_count)
        if comments is None:
            self.logger.debug('No numeric comment count on %s', response.url)
            return
        if comments <= self.min_comments:
            return

        required = {
            'name': response.xpath(
                '//em[@class="f-ff2"]/text()').extract_first(),
            'singer': response.xpath(
                '//div[@class="cnt"]/p[1]/span/a/text()').extract_first(),
            'album': response.xpath(
                '//div[@class="cnt"]/p[2]/a/text()').extract_first(),
            'album_url': response.xpath(
                '//div[@class="cnt"]/p[2]/a/@href').extract_first(),
        }
        song_id = response.xpath(
            '//div[@id="content-operation"]/@data-rid').extract_first()
        missing = [key for key, value in required.items() if value is None]
        if song_id is None:
            missing.append('url')
        if missing:
            self.logger.warning(
                'Skipping %s: missing %s', response.url, ', '.join(sorted(missing)))
            return

        item = Music163Item(**required)
        item['comments'] = comments
        # Pages with a link in the "tit" block get the plain song URL plus
        # a ``movie`` field; pages without it get the "#/" URL form and no
        # ``movie`` field.
        movie_href = response.xpath(
            '//div[@class="tit"]/a/@href').extract_first()
        if movie_href is not None:
            item['url'] = 'http://music.163.com/song?id=' + song_id
            item['movie'] = 'http://music.163.com' + movie_href
        else:
            item['url'] = 'http://music.163.com/#/song?id=' + song_id
        yield item

    def _request(self, url, callback):
        """Build a GET request carrying the spider's browser-like headers."""
        return scrapy.Request(
            url=url, headers=self.headers, method='GET', callback=callback)

    @staticmethod
    def _parse_count(text):
        """Return ``text`` as an int, or None when it is missing or non-numeric."""
        if text is None:
            return None
        try:
            return int(text.strip())
        except ValueError:
            return None
原创文章转载请注明出处。