Scrapy in Practice: Scraping Baidu Baike Data

2015-01-17, by a741423206

Scrapy is an application framework written for crawling websites and extracting structured data. It can be used in a wide range of programs, including data mining, information processing, and archiving historical data.

Architecture

(The original post showed the architecture diagram from the Scrapy documentation here.)

Reference: scrapy中文手册 (the Chinese translation of the Scrapy manual).
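In outline, the engine sits at the center and coordinates the other components: the scheduler queues requests, the downloader fetches pages, spiders parse responses into items and follow-up requests, and item pipelines post-process and store the extracted items.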

1. Install Scrapy
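The original post showed this step as a screenshot. A minimal sketch, assuming the Python 2 environment the rest of the post relies on: Scrapy itself plus MySQL-python (the MySQLdb driver imported by the pipeline later) both install with pip:

    pip install scrapy
    pip install MySQL-python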



2. Create a Scrapy project


scrapy startproject tutorial

This generates the following layout:
tutorial/
    scrapy.cfg
    tutorial/
        __init__.py
        items.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            ...
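In brief: items.py defines containers for the scraped data, pipelines.py decides what happens to each scraped item (here, writing it to MySQL), settings.py holds the project configuration, and spiders/ is where the crawler classes live.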

3. Create the spider file spiders/baike_spider.py with the following code


# -*- coding: utf-8 -*-
from scrapy.selector import HtmlXPathSelector
from tutorial.items import BaikeItem

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

i = 0         # number of entries scraped so far
not_data = 0  # number of pages where no title could be extracted

class BaikeSpider(CrawlSpider):

    name = "baike"
    # uncomment to slow the crawl down to one request per second
    # download_delay = 1
    allowed_domains = ['baike.baidu.com']
    # NOTE: the actual start URL was lost in the blog export and only its
    # anchor text survives; judging by the rules below, it pointed at a
    # Baidu Baike category listing page (the 互联网 / "Internet" category)
    start_urls = [
        "互联网"
    ]

    rules = [
        # follow pagination links within the category listing
        Rule(SgmlLinkExtractor(allow=('/?limit=30&index=([\w]+)&offset=([\d]+)\#gotoList', ),)),
        # follow links to individual entries and hand them to parse_item
        Rule(SgmlLinkExtractor(allow=('/view/', ),
            restrict_xpaths=('//div[@class="list"]')),
            callback='parse_item',
        ),
    ]

    def parse_item(self, response):
        global i, not_data
        i += 1  # count every entry we parse
        print i
        item = BaikeItem()
        sel = HtmlXPathSelector(response)
        baike_url = str(response.url)
        baike_name = sel.xpath('//div[@id="sec-content0"]/h1/span[@class="lemmaTitleH1"]/text()').extract()
        baike_desc = sel.xpath('//div[@class="card-summary-content"]/div[@class="para"]/text()').extract()

        if not baike_name:
            not_data += 1  # count the entries we failed to capture
            print not_data

        # taking [0] directly would raise IndexError on pages without a summary
        baike_desc = baike_desc[0] if baike_desc else 'not captured'

        # join the title fragments into a single UTF-8 string so it can be stored in MySQL
        item['title'] = ''.join(baike_name).encode('utf-8')
        item['link'] = baike_url.encode('utf-8')
        item['desc'] = baike_desc

        yield item
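Before launching a full crawl, the two XPath expressions are easy to sanity-check interactively. A quick session sketch, assuming a real entry page URL is substituted for the placeholder (the scrapy shell of this Scrapy generation exposes a ready-made sel selector):

    scrapy shell "http://baike.baidu.com/view/<some-entry>.htm"
    >>> sel.xpath('//div[@id="sec-content0"]/h1/span[@class="lemmaTitleH1"]/text()').extract()
    >>> sel.xpath('//div[@class="card-summary-content"]/div[@class="para"]/text()').extract()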


4. Define the item (tutorial/items.py)


import scrapy

class BaikeItem(scrapy.Item):
    title = scrapy.Field()
    link = scrapy.Field()
    desc = scrapy.Field()
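The field names must match the keys the spider assigns in parse_item: title, link, and desc.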

5. Save the data to MySQL (tutorial/pipelines.py)



# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

from twisted.enterprise import adbapi
import MySQLdb
import MySQLdb.cursors


class TutorialPipeline(object):

    def __init__(self):
        # asynchronous connection pool so inserts don't block the crawl
        self.dbpool = adbapi.ConnectionPool('MySQLdb',
                db = 'test',
                user = 'root',
                passwd = '',
                cursorclass = MySQLdb.cursors.DictCursor,
                charset = 'utf8',
                use_unicode = False
        )

    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self._conditional_insert, item)
        query.addErrback(self.handle_error)
        return item  # pass the item on so later pipelines and the engine see it

    def _conditional_insert(self, tx, item):
        # only store entries whose title was actually captured
        if item.get('title'):
            # the description column is named descc, presumably because
            # DESC is a reserved word in MySQL
            tx.execute(
                "insert into test (id, title, link, descc) "
                "values (null, %s, %s, %s)",
                (item['title'], item['link'], item['desc'])
            )

    def handle_error(self, failure):
        # log database errors instead of swallowing them silently
        print failure
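As the template comment notes, the pipeline only runs once it is registered in the project settings. A minimal sketch, assuming the default tutorial layout generated above and a pre-created MySQL table matching the insert statement (test with columns id, title, link, descc):

    # tutorial/settings.py
    ITEM_PIPELINES = {
        'tutorial.pipelines.TutorialPipeline': 300,  # the number sets relative ordering
    }

(On older Scrapy releases, ITEM_PIPELINES was a plain list of class paths instead of a dict.)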

6. Run the spider
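With the pipeline registered, the crawl is started from the project root using the spider's name attribute:

    scrapy crawl baike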



7. Results

(The original post ended with screenshots of the crawl results.)
