#-*- coding:utf-8 -*-
import os
import urllib2
import cStringIO
import Image
import re
import requests
from lxml import etree
# Request headers that make the crawler look like an IE11/Trident browser,
# so the site serves the normal desktop markup.
Mozilla_header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Accept': 'text/html, application/xhtml+xml, */*',
    'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
    'Connection': 'Keep-Alive',
}
links=[] #遍历url的地址
k=1
print u'请输入最后的页数:'
endPage=int(raw_input()) #最终的页数
for j in range(1,endPage+1):
if not os.path.exists('image'): #路径不存在时创建一个
os.makedirs('image')
url='http://www.mzitu.com/page/'+str(j) #页数的url地址
req=urllib2.Request(url, headers = Mozilla_header) #读取首页的内容
html = urllib2.urlopen(req).read()
selector=etree.HTML(html) #转换为xml,用于在接下来识别
links=selector.xpath('//li/a[@target="_blank"]/@href') #抓取当前页面的所有帖子的url
hot=selector.xpath('//li//span[@class="view"]/text()')
for i in range(len(links)):
h=''
for a in re.findall('\d+',hot[i]):
h+=a
if int(h)>500000:
req2 = urllib2.Request(links[i], headers = Mozilla_header)
html2 = urllib2.urlopen(req2).read() #读取当前页面的内容
selector2=etree.HTML(html2) #转换为xml用于识别
page=selector2.xpath('//div[@class="pagenavi"]//span[last()-1]/text()')
break_flag=0
#此处就是遍历下载
for k in range(1,int(page[0])):
if break_flag:
break
req3 = urllib2.Request(links[i]+'/'+str(k), headers = Mozilla_header)
html3 = urllib2.urlopen(req3).read() #读取当前页面的内容
selector3=etree.HTML(html3) #转换为xml用于识别
link_pic = selector3.xpath('//div[@class="main-image"]//@src')
for each in link_pic:
t=3
req4 = urllib2.Request(each, headers = Mozilla_header)
while(t):
try:
image1=urllib2.urlopen(req4,timeout=10).read() #读取图片的内容
tmpIm = cStringIO.StringIO(image1)
pic_name='image/'+each[7:].replace('/','_')
if os.path.exists(pic_name):
break_flag=1
break
else:
print u'正在下载%s'%each
fp=open(pic_name,'wb') #下载在当前目录下 image文件夹内,图片格式为jpg
fp.write(image1) #写入图片
fp.close()
break
except:
t=t-1
print u'下载完成!'