python scrapy学习笔记
#!/usr/bin/python
# coding: utf-8
import scrapy
from scrapy.selector import Selector
import os
import requests
class NextSpider(scrapy.spiders.Spider):
    """Scrape the comment list of a single jobbole group thread.

    For every comment that has an avatar image, an author name and body
    text, the spider prints the author and the message, then saves the
    avatar as ``imgs/<author>.png`` under the current working directory.
    """

    name = 'nextspider'
    start_urls = ["http://group.jobbole.com/27740/#comm-77724"]

    def parse(self, response):
        """Print each comment on the page and download its author's avatar.

        Args:
            response: the page Scrapy downloaded for one of ``start_urls``.
        """
        img_dir = os.path.join(os.getcwd(), 'imgs')
        comments = Selector(response=response).xpath(
            '//ul[@class="cmnt-list"]/li')

        # Query relative to each <li> selector instead of rebuilding an
        # absolute ``li[%d]`` path from ``range(len(items))``: XPath positions
        # are 1-based, so index 0 matched nothing and the last comment was
        # never visited.
        for comment in comments:
            srcs = comment.xpath(
                './/div[@class="cmnt-header"]/a/img/@src').extract()
            names = comment.xpath(
                './/div[@class="cmnt-header"]/div/span/a/text()').extract()
            msgs = comment.xpath(
                './/div[@class="cmnt-body"]/p/text()').extract()

            # Skip list entries that are missing any of the three parts.
            if not (srcs and names and msgs):
                continue

            # extract() returns lists; the first match is the value we want.
            img_url = srcs[0]
            author = names[0]
            # NOTE(review): the original join expression was corrupted in the
            # source; joining every text fragment of the body is assumed.
            msg = u','.join(msgs)
            print(u'用户ID: {}\n发表信息: {}'.format(author, msg))

            # The author name comes from the page, so neutralise path
            # separators before using it as a file name inside img_dir.
            safe_name = author.replace('/', '_').replace('\\', '_')
            path = os.path.join(img_dir, safe_name + '.png')

            try:
                self._save_image(img_url, path)
            except (requests.RequestException, EnvironmentError) as e:
                # Best effort: report the failed avatar and keep going.
                print(u'错误: {}'.format(e))

    def _save_image(self, url, path):
        """Download ``url`` and write the response body to ``path``.

        Raises:
            requests.RequestException: on network errors, timeouts or a
                non-success HTTP status.
            EnvironmentError: if the directory or file cannot be written.
        """
        # A timeout keeps one stalled download from blocking the crawl.
        reply = requests.get(url, timeout=10)
        # Fail here rather than saving an HTTP error page as a .png file.
        reply.raise_for_status()

        directory = os.path.dirname(path)
        if not os.path.isdir(directory):
            os.makedirs(directory)

        # ``with`` flushes and closes the file; no explicit calls are needed.
        with open(path, 'wb') as f:
            f.write(reply.content)
页:
[1]