Python Web Crawling (7): Deep Crawling with CrawlSpider
Published: 2019-06-27



Before getting into deep crawling, it is worth recommending a simple, practical library: fake-useragent. It generates realistic values for the User-Agent field of your request headers.

```
# install
pip install fake-useragent
```

```python
# usage
import requests
from fake_useragent import UserAgent

ua = UserAgent()
headers = {'User-Agent': ua.random}
url = 'URL of the page to crawl'
resp = requests.get(url, headers=headers)
```
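Since the rest of this post uses Scrapy rather than requests, a natural next step is to plug fake-useragent into a downloader middleware so every request gets a fresh random User-Agent. This is only a sketch (the project path in the comment is hypothetical); it mirrors the UseragentMiddleware shown in section 6, but draws its values from fake-useragent instead of a hard-coded list:

```python
# middlewares.py -- sketch of a fake-useragent based downloader middleware
from fake_useragent import UserAgent


class RandomUserAgentMiddleware(object):
    def __init__(self):
        self.ua = UserAgent()

    def process_request(self, request, spider):
        # overwrite the User-Agent header with a freshly generated value
        request.headers['User-Agent'] = self.ua.random


# To activate it, register the class in settings.py, e.g.
# DOWNLOADER_MIDDLEWARES = {'yourproject.middlewares.RandomUserAgentMiddleware': 543}
# ('yourproject' stands for your actual project package name)
```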

#1. Deep crawling with CrawlSpider

The base class, the commands for creating a project and a crawl-style spider, and the core imports:

```
# base class
scrapy.spiders.CrawlSpider

# create a project
scrapy startproject <project_name>

# create a crawl-style spider
scrapy genspider -t crawl <spider_name> <domain>

# core rule handling
from scrapy.spiders import CrawlSpider, Rule

# core link extraction
from scrapy.linkextractors import LinkExtractor
```
  • rules: a collection of Rule objects (each wrapping a regular-expression based link extractor) that tells the spider which links to follow.

  • Each Rule also takes a callback used to parse the responses that get downloaded; the generated parse_item() method is an example of pulling data out of such a response.

  • You can experiment with extraction interactively via the shell: scrapy shell http://baidu.com

A minimal CrawlSpider skeleton putting these pieces together is sketched below.
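This is only a sketch under assumed names (the spider name, domain, URL pattern, and XPath are placeholders, not taken from the original post); it shows the smallest CrawlSpider that follows pagination links and hands each followed page to a callback:

```python
# spiders/example.py -- minimal CrawlSpider sketch; all names are placeholders
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class ExampleSpider(CrawlSpider):
    name = 'example'
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/list?page=1']

    rules = (
        # follow every pagination link and pass each response to parse_item
        Rule(LinkExtractor(allow=(r'page=\d+',)), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # extract whatever fields you need from each followed page
        yield {'title': response.xpath('//title/text()').extract_first()}
```

Note that the callback is deliberately not named parse: CrawlSpider uses parse internally to drive the rules, so overriding it would break link following.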

#2. Link extraction: LinkExtractor

The classic signature looks like this (in current Scrapy versions the same parameters are available on scrapy.linkextractors.LinkExtractor, which supersedes SgmlLinkExtractor):

```python
class scrapy.contrib.linkextractors.sgml.SgmlLinkExtractor(
    allow = (),              # only extract links whose URLs match these regular expressions
    deny = (),               # never extract links whose URLs match these regular expressions
    allow_domains = (),      # only extract links pointing to these domains
    deny_domains = (),       # never extract links pointing to these domains
    deny_extensions = (),    # skip links whose URLs end in these file extensions
    restrict_xpaths = (),    # only look for links inside these XPath regions (combined with allow)
    tags = (),               # tag names that are scanned for links
    attrs = (),              # attributes the link URLs are read from
    canonicalize = True,     # canonicalize extracted URLs
    unique = True,           # drop duplicate links
    process_value = None     # optional callable applied to every extracted value
)
```
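Before baking a pattern into a rule, it can help to try the extractor interactively. The following sketch assumes you are inside scrapy shell on some listing page (the URL, pattern, and XPath are only examples):

```python
# inside: scrapy shell http://example.com/list   (URL is just an example)
from scrapy.linkextractors import LinkExtractor

# keep only pagination links that appear inside the main content area
le = LinkExtractor(allow=(r'page=\d+',), restrict_xpaths=('//div[@id="content"]',))
for link in le.extract_links(response):
    print(link.url, link.text)
```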

#3. Crawl rules: rules

```python
rules = [
    Rule(
        link_extractor,         # a LinkExtractor object
        callback=None,          # callback invoked for each response downloaded from an extracted link;
                                # do not name it 'parse', CrawlSpider needs that method internally
        cb_kwargs=None,         # extra keyword arguments passed to the callback
        follow=None,            # boolean: whether to keep following links found in these responses
        process_links=None,     # called on every list of extracted links, mainly for filtering
        process_request=None    # called on every generated request, for filtering or modification
    )
]
```
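process_links and process_request are listed above but not demonstrated anywhere in this post, so here is a small hedged sketch (spider name, URLs, and the filtering condition are invented for illustration) of using process_links to drop unwanted links before requests are scheduled:

```python
# sketch: filtering extracted links with process_links; all names are placeholders
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class FilteredSpider(CrawlSpider):
    name = 'filtered'
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/']

    rules = (
        Rule(
            LinkExtractor(allow=(r'/item/\d+',)),
            callback='parse_item',
            follow=True,
            process_links='drop_logout_links',   # a string naming a spider method also works
        ),
    )

    def drop_logout_links(self, links):
        # receives the list of Link objects extracted from each response
        return [link for link in links if 'logout' not in link.url]

    def parse_item(self, response):
        yield {'url': response.url}
```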

#4. How to run a spider directly from PyCharm

1. Create a start.py file at the top level of the project:

```python
# -*- coding:utf-8 -*-
from scrapy import cmdline   # bring in Scrapy's command line interface

cmdline.execute('scrapy crawl dang'.split())
```

2. Set it up as shown in the figure (the screenshot from the original post is not reproduced here).

#### After setting all of this up, it turns out you can simply run start.py directly; none of the extra configuration is actually needed. An alternative, fully programmatic launcher is sketched below.
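As an alternative to shelling out through cmdline, Scrapy can also be started programmatically. This is a hedged sketch using CrawlerProcess; the spider name 'dang' is taken from the snippet above, everything else is standard Scrapy API:

```python
# start.py -- alternative launcher using CrawlerProcess instead of cmdline
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('dang')   # the spider's `name` attribute
process.start()         # blocks until the crawl finishes
```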

#5. Using CrawlSpider to scrape Python job listings from Liepin

  • Create the project:

```
scrapy startproject liep
```

  • Generate the spider skeleton under spiders/ (add -t crawl if you want to start from the CrawlSpider template):

```
scrapy genspider lp liepin.com
```
  • items.py
```python
# -*- coding: utf-8 -*-
import scrapy


class LiepItem(scrapy.Item):
    name = scrapy.Field()
    company = scrapy.Field()
    salary = scrapy.Field()
    address = scrapy.Field()
    # required work experience
    experience = scrapy.Field()
```
  • pipelines.py
```python
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json


class LiepPipeline(object):
    def __init__(self):
        self.file = open('liepin.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        text = json.dumps(dict(item), ensure_ascii=False)
        self.file.write(text)
        print('QAQ ----> writing item')
        return item

    def close_spider(self, spider):
        self.file.close()
```
  • lp.py
```python
# -*- coding: utf-8 -*-
import re

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from liep.items import LiepItem


class LpSpider(CrawlSpider):
    reg = re.compile(r'\s*')
    name = 'lp'
    allowed_domains = ['www.liepin.com']
    start_urls = ['https://www.liepin.com/zhaopin/?pubTime=&ckid=6f6956c5d999c17e&fromSearchBtn=2&compkind=&isAnalysis=&init=-1&searchType=1&dqs=020&industryType=&jobKind=&sortFlag=15&degradeFlag=0&industries=040&salary=0%240&compscale=&key=python&clean_condition=&headckid=7a006343bdb04f47&curPage=0']

    # rule for extracting the pagination links to follow
    page_link = LinkExtractor(allow=(r'&curPage=\d+',))

    # crawl rules
    rules = (
        Rule(page_link, callback='parse_content', follow=True),
    )

    # callback for every downloaded listing page
    def parse_content(self, response):
        # the region of the page holding the data we need
        job_list = response.xpath('//div[@class="job-info"]')
        for job in job_list:
            # one Item per job posting
            item = LiepItem()
            name = job.xpath('.//h3/a')
            item['name'] = self.reg.sub('', name.xpath('string(.)').extract()[0])
            item['company'] = job.xpath('..//p[@class="company-name"]/a/text()').extract()
            item['salary'] = job.xpath('.//span[@class="text-warning"]/text()').extract()
            item['address'] = job.xpath('.//p[@class="condition clearfix"]//a/text()').extract()
            item['experience'] = job.xpath('.//p[@class="condition clearfix"]//span[3]/text()').extract()
            yield item
```
  • settings.py
```python
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.104 Safari/537.36',
}

# uncomment ITEM_PIPELINES and point it at the pipeline defined above
ITEM_PIPELINES = {
    'liep.pipelines.LiepPipeline': 300,
}
```
  • The scraped results in liepin.json:
{  "salary": "12-24万",  "company": "嗨皮(上海)网络科技股份有限公司",  "name": "python开发工程师",  "experience": "3年工作经验",  "address": "上海"}{  "salary": "14-28万",  "company": "第一弹",  "name": "Python后端开发",  "experience": "3年工作经验",  "address": "上海"}{  "salary": "12-18万",  "company": "易路软件",  "name": "Python中级开发工程师",  "experience": "3年工作经验",  "address": "上海-闵行区"}{  "salary": "11-21万",  "company": "信用飞/首付游",  "name": "Python开发工程师(风控方向)",  "experience": "1年工作经验",  "address": "上海-徐汇区"}{  "salary": "13-24万",  "company": "联车科技",  "name": "python开发",  "experience": "3年工作经验",  "address": "上海"}{  "salary": "12-24万",  "company": "寻仟信息",  "name": "Python开发工程师",  "experience": "1年工作经验",  "address": "上海"}{  "salary": "12-22万",  "company": "ifuwo",  "name": "Python开发工程师",  "experience": "1年工作经验",  "address": "上海-浦东新区"}{  "salary": "12-24万",  "company": "小葫芦",  "name": "python开发工程师",  "experience": "1年工作经验",  "address": "上海"}{  "salary": "14-24万",  "company": "ifuwo",  "name": "python后台工程师",  "experience": "2年工作经验",  "address": "上海-浦东新区"}{  "salary": "面议",  "company": "森浦资讯",  "name": "Python开发工程师",  "experience": "2年工作经验",  "address": "上海"}{  "salary": "14-24万",  "company": "优刻得",  "name": "OPL-python运维开发",  "experience": "2年工作经验",  "address": "上海"}{  "salary": "面议",  "company": "上海聪牛金融信息服务有限公司",  "name": "python开发工程师",  "experience": "2年工作经验",  "address": "上海"}{  "salary": "12-30万",  "company": "进馨网络",  "name": "python开发工程师",  "experience": "3年工作经验",  "address": "上海"}{  "salary": "12-18万",  "company": "载信软件",  "name": "Python工程师",  "experience": "1年工作经验",  "address": "上海"}{  "salary": "14-24万",  "company": "优刻得",  "name": "OPL-python运维开发J10605",  "experience": "1年工作经验",  "address": "上海"}{  "salary": "10-24万",  "company": "上海霄骋信息科技有限公司",  "name": "Python爬虫开发工程师",  "experience": "2年工作经验",  "address": "上海"}{  "salary": "面议",  "company": "五五海淘",  "name": "Python",  "experience": "1年工作经验",  "address": "上海"}..................................复制代码

#6. Using middleware to set request headers and proxies

  • settings.py
```python
# -*- coding: utf-8 -*-
BOT_NAME = 'tea'

SPIDER_MODULES = ['tea.spiders']
NEWSPIDER_MODULE = 'tea.spiders'

# write the crawler's log output to the given file
LOG_FILE = 's.log'
# log level: DEBUG is the most verbose and records everything,
# then detailed log -> summary info -> warnings -> errors ...
LOG_LEVEL = 'INFO'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'tea (+http://www.yourdomain.com)'
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
    "Opera/8.0 (Windows NT 5.1; U; en)",
    "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
]

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    # 'tea.middlewares.MyCustomDownloaderMiddleware': 543,
    'tea.middlewares.UseragentMiddleware': 543,
    'tea.middlewares.ProxyMiddleware': 600,
}

PROXY = [
    {"ip_port": "178.62.47.236:80"},
    {"ip_port": "125.77.25.116:80"},
    {"ip_port": "13.58.249.76:8080"},
    {"ip_port": "37.204.253.2:8081"},
    {"ip_port": "78.47.174.243:3128"},
    {"ip_port": "139.59.235.243:3128", "user_password": "admin:123123"},
]
```
  • middlewares.py
```python
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
import random
import base64

from tea.settings import USER_AGENTS, PROXY


# A custom downloader middleware -- it only takes effect once it has been
# registered in DOWNLOADER_MIDDLEWARES in settings.py.
class UseragentMiddleware(object):
    # process_request receives the request being handled and the spider.
    # It must return None or a Request: None means the request was handled
    # and should be passed on to the next middleware; a Request is handed
    # back to the engine and re-queued.
    def process_request(self, request, spider):
        print('----QAQ-----')
        # pick a random User-Agent
        useragent = random.choice(USER_AGENTS)
        # set it on the request headers
        request.headers.setdefault('User-Agent', useragent)
        print('---->headers successful')
        return None


class ProxyMiddleware(object):
    def process_request(self, request, spider):
        print('------->-_-')
        proxy = random.choice(PROXY)
        # attach the proxy to the request (include the scheme in the proxy URL)
        print(proxy['ip_port'], proxy.get('user_password', None))
        request.meta['proxy'] = 'http://' + proxy['ip_port']
        # proxy authentication, if credentials are configured
        if proxy.get('user_password', None):
            b64 = base64.b64encode(proxy['user_password'].encode('utf-8')).decode('ascii')
            print(b64)
            request.headers['Proxy-Authorization'] = 'Basic ' + b64
            print('======proxy======')
```
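A quick way to check that both middlewares actually take effect is to crawl an echo service and log what the server saw. This sketch is not part of the original post and assumes httpbin.org is reachable:

```python
# spiders/check.py -- hypothetical helper spider for verifying the middlewares
import json

import scrapy


class CheckSpider(scrapy.Spider):
    name = 'check'
    start_urls = ['https://httpbin.org/get']

    def parse(self, response):
        # httpbin echoes the request headers and the originating IP,
        # so the random User-Agent and the proxy address should show up here
        data = json.loads(response.text)
        self.logger.info('User-Agent seen by server: %s', data['headers'].get('User-Agent'))
        self.logger.info('Origin IP seen by server: %s', data.get('origin'))
```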

#7. Scraping Meici product details and storing them in a database

The code is as follows:

  • items.py
```python
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class MeiciItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class JsArticleItem(scrapy.Item):
    brand = scrapy.Field()
    productitle = scrapy.Field()
    price = scrapy.Field()
    color = scrapy.Field()
    szie = scrapy.Field()
    proimg = scrapy.Field()
    prodata = scrapy.Field()
    brandstory = scrapy.Field()
    brandimg = scrapy.Field()
    meiciid = scrapy.Field()
```
  • middlewares.py
```python
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals


class MeiciSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
```
  • pipelines.py
```python
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json

import pymysql

from meici import settings


class MeiciPipeline(object):
    def process_item(self, item, spider):
        return item


class WebcrawlerScrapyPipeline(object):
    def __init__(self):
        self.connect = pymysql.connect(
            host=settings.MYSQL_HOST,
            db=settings.MYSQL_DBNAME,
            user=settings.MYSQL_USER,
            passwd=settings.MYSQL_PASSWD,
            charset='utf8',
            use_unicode=True)
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        print(item['meiciid'])
        # skip products that have already been stored
        self.cursor.execute(
            """select meiciid from goods where meiciid = %s;""",
            (item['meiciid'],))
        ret = self.cursor.fetchone()
        if not ret:
            self.cursor.execute(
                """insert into goods(brand,productitle,price,color,
                szie,proimg,prodata,brandstory,brandimg,meiciid)
                 values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);""",
                (item['brand'],
                 item['productitle'],
                 item['price'],
                 item['color'],
                 item['szie'],
                 item['proimg'],
                 item['prodata'],
                 item['brandstory'],
                 item['brandimg'],
                 item['meiciid']))
            self.connect.commit()
            print('Product saved')
        return item
```

(The pipeline assumes a goods table already exists in the meici database; a guess at its schema is sketched after the code listings below.)
  • settings.py
```python
# -*- coding: utf-8 -*-

# Scrapy settings for meici project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'meici'

SPIDER_MODULES = ['meici.spiders']
NEWSPIDER_MODULE = 'meici.spiders'

# MySQL connection settings
MYSQL_HOST = '127.0.0.1'
MYSQL_DBNAME = 'meici'      # database name, change as needed
MYSQL_USER = 'root'         # database user, change as needed
MYSQL_PASSWD = '960226'     # database password, change as needed
MYSQL_PORT = 3306           # database port, used in dbhelper

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'meici (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'meici.middlewares.MeiciSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'meici.middlewares.MyCustomDownloaderMiddleware': 543,
#}
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'meici.pipelines.WebcrawlerScrapyPipeline': 300,   # save to the MySQL database
    'meici.pipelines.MeiciPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
```
  • meicispider.py
```python
# -*- coding: utf-8 -*-
import re
import json

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from meici import items


class Meicispider(CrawlSpider):
    name = 'meici'
    allowed_domains = ['meici.com']
    start_urls = ['http://www.meici.com/product/detail/id/300251/saleid/156692.html']

    rules = (
        Rule(LinkExtractor(allow=(r'/product/\w+/id/\d+/saleid/\d+.html',)),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        reg = re.compile(r'\s*')
        xml = response
        brand = xml.xpath('//*[@id="content"]/div/div[1]/div[2]/h1/a/text()').extract()[0]
        productitle = xml.xpath('//*[@id="content"]/div/div[1]/div[2]/div[1]/div/text()').extract()[0]
        price = xml.xpath('//*[@id="content"]/div/div[1]/div[2]/div[2]/div/div/span/em/text()').extract()[0]
        color = xml.xpath('//li[@class="colorcur"]/@title').extract()[0]
        szie = xml.xpath('//div[@class="pro_size"]//ul/li/a/text()').extract()
        proimg = xml.xpath('//div[@class="proImg"]//img/@src').extract()
        prodata1 = xml.xpath('//div[@class="proTableinfo"]//th//text()').extract()
        prodata2 = xml.xpath('//div[@class="proTableinfo"]//td//text()').extract()
        brandstory = xml.xpath('//div[@class="proBrand_l"]/p/text()').extract()[0]
        brandimg = xml.xpath('//div[@class="proBrand_r"]/img/@src').extract()[0]
        meiciid = xml.xpath('//td[@class="product_sku"]/text()').extract()[0]

        # drop the extra cells so the remaining values line up with the headers
        del prodata2[9]
        del prodata2[10]

        # strip whitespace from headers and values, then zip them into a dict
        key = [reg.sub('', i) for i in prodata1]
        value = [reg.sub('', j) for j in prodata2]
        prodata = dict(zip(key, value))
        prodata = json.dumps(prodata, ensure_ascii=False)

        item = items.JsArticleItem()
        item['brand'] = brand
        item['productitle'] = productitle
        item['price'] = price
        item['color'] = color
        item['szie'] = str(szie)
        item['proimg'] = str(proimg)
        item['prodata'] = prodata
        item['brandstory'] = brandstory
        item['brandimg'] = brandimg
        item['meiciid'] = meiciid
        yield item
```
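The MySQL pipeline above assumes a goods table already exists, but the original post never shows its schema. The following one-off script is only a guess: the column names come from the insert statement and the connection values from settings.py, while the column types are assumptions you may want to adjust:

```python
# create_goods_table.py -- hypothetical helper; the column types are assumptions
import pymysql

DDL = """
CREATE TABLE IF NOT EXISTS goods (
    id          INT AUTO_INCREMENT PRIMARY KEY,
    brand       VARCHAR(255),
    productitle VARCHAR(255),
    price       VARCHAR(64),
    color       VARCHAR(64),
    szie        VARCHAR(255),   -- spelled 'szie' to match the item field name
    proimg      TEXT,
    prodata     TEXT,
    brandstory  TEXT,
    brandimg    VARCHAR(512),
    meiciid     VARCHAR(64)
) DEFAULT CHARSET = utf8;
"""

connection = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                             passwd='960226', db='meici', charset='utf8')
try:
    with connection.cursor() as cursor:
        cursor.execute(DDL)
    connection.commit()
finally:
    connection.close()
```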

