抓取网易云音乐评论数大于10w的歌曲信息存入mongo

Robin 4863次浏览

摘要:还在为找不到好音乐而烦恼吗?今天利用Scrapy来抓取网易云音乐评论数大于10w的好歌曲,让你的音乐列表鼓起来!

首先创建Scrapy项目(创建方法在之前的文章中已经介绍过,这里不再赘述),然后在settings.py中配置MongoDB的连接信息。除了歌曲之外,我还把所有歌手的信息也存入了MongoDB,因此这里配置了两个集合名:

# MongoDB connection settings; the item pipeline reads these keys from the
# Scrapy settings object when it is constructed.
MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
# Name of the database that holds both collections below.
MONGODB_DBNAME = 'music163'
# Collection that receives singer items ("geshou" is pinyin for "singer").
MONGODB_DOCNAME_GESHOU = 'singer'
# Collection that receives song items.
MONGODB_DOCNAME_MUSICS = 'musics'


items.py中需要定义两个Item类,分别用来保存歌曲信息(Music163Item)和歌手信息(Music163SingerItem):

import scrapy

class Music163Item(scrapy.Item):
    """Item holding the data scraped from a single song page.

    The spider's ``music_parse`` callback fills these fields; the pipeline
    converts the item to a dict and inserts it into the songs collection.
    """
    # Not assigned by the spider in this file; presumably reserved for the
    # MongoDB document id -- TODO confirm.
    _id = scrapy.Field()
    # Text of the page's ``<em class="f-ff2">`` element (the song title).
    name = scrapy.Field()
    # Song URL built from the ``data-rid`` attribute of #content-operation.
    url = scrapy.Field()
    # Absolute link taken from the ``<a>`` inside ``div.tit``; presumably the
    # song's music-video page -- TODO confirm. Absent on some pages.
    movie = scrapy.Field()
    # Text of the first link in the first ``<p>`` of ``div.cnt``.
    singer = scrapy.Field()
    # Text and href of the link in the second ``<p>`` of ``div.cnt``.
    album = scrapy.Field()
    album_url = scrapy.Field()
    # Comment counter read from the page; the spider filters on this value.
    comments = scrapy.Field()

class Music163SingerItem(scrapy.Item):
    """Item holding one entry of the artist listing page.

    The spider's ``parse`` callback fills these fields from each ``<li>`` of
    the ``#m-artist-box`` list.
    """
    # Not assigned by the spider in this file; presumably reserved for the
    # MongoDB document id -- TODO confirm.
    _id = scrapy.Field()
    # Link text of the artist entry (the singer's display name).
    singer = scrapy.Field()
    # ``src`` of the entry's ``<img>``; only set for entries that have one.
    headimg = scrapy.Field()
    # Absolute URL of the singer's page; the spider requests it next.
    info_url = scrapy.Field()


pipelines.py中需要根据传入的item类型来判断存入哪个集合(注意:要让这个pipeline生效,还需要在settings.py的ITEM_PIPELINES中启用它):

import pymongo
from scrapy.conf import settings
from .items import Music163Item
from .items import Music163SingerItem

class Music163Pipeline(object):
    """Persist scraped items to MongoDB, one collection per item type.

    ``Music163Item`` objects go to the collection named by
    ``MONGODB_DOCNAME_MUSICS`` and ``Music163SingerItem`` objects go to the
    one named by ``MONGODB_DOCNAME_GESHOU``. Storage is best-effort: a failed
    insert is logged and the item is still passed on to later pipelines.
    """

    def __init__(self):
        host = settings['MONGODB_HOST']
        port = settings['MONGODB_PORT']
        db_name = settings['MONGODB_DBNAME']
        self.client = pymongo.MongoClient(host=host, port=port)
        self.tdb = self.client[db_name]
        # One handle per collection. ``post`` keeps its original name and
        # always points at the songs collection; it must never be rebound to
        # the singers collection, otherwise later songs land in the wrong
        # place.
        self.post = self.tdb[settings['MONGODB_DOCNAME_MUSICS']]
        self.singer_post = self.tdb[settings['MONGODB_DOCNAME_GESHOU']]

    def close_spider(self, spider):
        """Close the MongoDB connection when the spider finishes."""
        self.client.close()

    def process_item(self, item, spider):
        """Check the item type, then store it in the matching collection.

        Args:
            item: the scraped item; types other than the two known item
                classes are passed through untouched.
            spider: the spider that produced the item (used for logging).

        Returns:
            The same ``item``, so that later pipelines still receive it.
        """
        if isinstance(item, Music163Item):
            self._store(self.post, item, 'Music', spider)
        elif isinstance(item, Music163SingerItem):
            self._store(self.singer_post, item, 'Singer', spider)
        return item

    @staticmethod
    def _store(collection, item, label, spider):
        """Insert ``item`` into ``collection``, logging instead of raising."""
        try:
            # insert_one() replaces Collection.insert(), which is deprecated
            # and no longer exists in current PyMongo releases.
            collection.insert_one(dict(item))
        except Exception:
            # Keep the original best-effort behaviour (one bad document must
            # not stop the crawl) but leave a trace of what went wrong.
            spider.logger.exception('%s insert failed', label)
        else:
            print('%s Successful!' % label)


爬虫主文件main.py:

import scrapy
import requests
from scrapy import Selector
from ..items import Music163Item
from ..items import Music163SingerItem


class MusicSpider(scrapy.Spider):
    """Crawl music.163.com and emit songs with a large number of comments.

    Crawl order: artist listing pages (one per category id and initial
    letter) -> each singer's page -> each song page. A ``Music163SingerItem``
    is emitted for every artist entry, and a ``Music163Item`` for every song
    whose comment counter exceeds ``min_comments``.
    """
    name = 'musicspider'
    # Scrapy only honours the plural attribute name, and the entries must be
    # bare domain names rather than URLs.
    allowed_domains = ['music.163.com']
    # NOTE: despite its name this is a URL *template* (a str, not a list); it
    # is only consumed by start_requests() below.
    start_urls = 'http://music.163.com/discover/artist/cat?id={gid}&initial={initial}'
    group_ids = (1001, 1002, 1003, 2001, 2002, 2003, 6001, 6002, 6003, 7001, 7002, 7003, 4001, 4002, 4003)
    referer = 'http://music.163.com'
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36'
    headers = {'User-Agent': user_agent, 'Referer': referer}
    # Prefix used to turn the site's relative hrefs into absolute URLs.
    base_url = 'http://music.163.com'
    # Songs are kept only when their comment count is strictly greater.
    min_comments = 100000

    def start_requests(self):
        """Yield one listing request per (category id, initial letter)."""
        for gid in self.group_ids:
            # The ``initial`` query parameter is the character code of A..Z.
            for initial in range(ord('A'), ord('Z') + 1):
                yield scrapy.Request(
                    url=self.start_urls.format(gid=gid, initial=initial),
                    headers=self.headers,
                    callback=self.parse,
                )

    def parse(self, response):
        """Parse an artist listing page.

        Yields a ``Music163SingerItem`` for each entry, followed by a request
        for that singer's page. Entries without a usable link are skipped.
        """
        for entry in response.xpath('//*[@id="m-artist-box"]/li'):
            # The link is looked up under <p> first and directly under <li>
            # as a fallback, mirroring the two layouts the original handled.
            link = entry.xpath('p/a[1]')
            if not link:
                link = entry.xpath('a[1]')
            singer = link.xpath('text()').extract_first()
            href = link.xpath('@href').extract_first()
            if not singer or not href:
                continue

            item = Music163SingerItem()
            item['singer'] = singer
            # The href may carry stray whitespace, so strip before joining.
            item['info_url'] = self.base_url + href.strip()
            headimg = entry.xpath('div/img/@src').extract_first()
            if headimg:
                item['headimg'] = headimg

            # Hand the singer to the pipeline so it is actually stored.
            yield item
            yield scrapy.Request(
                url=item['info_url'],
                headers=self.headers,
                callback=self.singer_parse,
            )

    def singer_parse(self, response):
        """Parse a singer page and request every song listed on it."""
        for song in response.xpath('//ul[@class="f-hide"]/li'):
            href = song.xpath('a/@href').extract_first()
            if not href:
                continue
            yield scrapy.Request(
                url=self.base_url + href,
                headers=self.headers,
                callback=self.music_parse,
            )

    def music_parse(self, response):
        """Parse a song page and yield it if it has enough comments.

        Pages whose comment counter is missing or not numeric, and pages
        lacking any of the required fields, are skipped instead of raising.
        """
        raw_count = response.xpath(
            '//span[@class="sub s-fc3"]/span/text()').extract_first()
        try:
            # The counter is scraped as text; compare it as a number.
            comments = int(raw_count)
        except (TypeError, ValueError):
            self.logger.debug(
                'No numeric comment count on %s: %r', response.url, raw_count)
            return
        if not comments > self.min_comments:
            return

        name = response.xpath('//em[@class="f-ff2"]/text()').extract_first()
        song_id = response.xpath(
            '//div[@id="content-operation"]/@data-rid').extract_first()
        singer = response.xpath(
            '//div[@class="cnt"]/p[1]/span/a/text()').extract_first()
        album = response.xpath(
            '//div[@class="cnt"]/p[2]/a/text()').extract_first()
        album_url = response.xpath(
            '//div[@class="cnt"]/p[2]/a/@href').extract_first()
        if None in (name, song_id, singer, album, album_url):
            self.logger.warning(
                'Incomplete song page skipped: %s', response.url)
            return

        item = Music163Item()
        item['name'] = name
        item['singer'] = singer
        item['album'] = album
        item['album_url'] = album_url
        item['comments'] = comments

        # ``movie`` is optional. The two URL forms below are kept exactly as
        # the original produced them for pages with and without that link.
        movie_href = response.xpath(
            '//div[@class="tit"]/a/@href').extract_first()
        if movie_href is not None:
            item['url'] = 'http://music.163.com/song?id=' + song_id
            item['movie'] = self.base_url + movie_href
        else:
            item['url'] = 'http://music.163.com/#/song?id=' + song_id
        yield item


原创文章转载请注明出处。

相关文章