#coding=utf-8
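# Scrape photos from Taobao's model listing (mm.taobao.com) and sort them into one folder per listed location, naming each file after the model.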
import urllib2
import re
import os
from time import sleep
from sys import exit
import random
# Listing endpoint; the page number is appended to this prefix.
url_base = 'https://mm.taobao.com/json/request_top_list.htm?page='

# Captures, for each list item: image URL, model name, model location.
pattern = r'<div class="list-item">.*?<img src="(.*?)".*?>.*?<a class=.*?"_blank">(.*?)</a>.*?<span>(.*?)</span>'

# Browser-like User-Agent sent with listing-page requests.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36',
}

dirList = [1]

# A `global` statement at module level has no effect; kept as in the
# original script.
global count, images

# Work from inside a 'taobaonv' directory under the starting working
# directory, creating it on first run.
if not os.path.exists('taobaonv'):
    os.mkdir('taobaonv')
os.chdir(os.path.abspath('taobaonv'))

# Absolute path of the download root, captured after the chdir above.
path = os.getcwd()
def getData():
    """Download the listing page whose number is the global ``count``.

    The URL is ``url_base + str(count)`` and the request carries the
    module-level ``header`` dict. The raw response body is stored in the
    global ``pageData``, which ``getImages`` reads.

    Errors raised by ``urllib2`` propagate to the caller; ``pageData`` is
    left untouched in that case.
    """
    global pageData
    url = url_base + str(count)
    request = urllib2.Request(url, headers=header)
    response = urllib2.urlopen(request)
    try:
        pageData = response.read()
    finally:
        # Release the connection even if reading the body fails.
        response.close()
def saveImage(imageInfor):
    """Download one image and write it to ``<path>/<location>/<name>.jpg``.

    Args:
        imageInfor: 3-item sequence ``(image_url, name, location)``, i.e. one
            match of the module-level ``pattern``. ``'http:'`` is prepended
            to ``image_url``, so it is expected to be scheme-less
            (``//host/...``).

    Errors from the download or from the filesystem propagate to the caller.
    """
    image_url = 'http:' + imageInfor[0]
    name = imageInfor[1]
    location = imageInfor[2]

    # Fetch first, so a failed download does not leave an empty folder behind.
    response = urllib2.urlopen(image_url)
    try:
        imageData = response.read()
    finally:
        response.close()

    # Create the per-location folder on first use. The absolute path is used
    # for both the check and the mkdir so they always refer to the same place.
    folder = os.path.join(path, location)
    if not os.path.isdir(folder):
        os.mkdir(folder)

    # os.path.join picks the right separator for the platform; a hard-coded
    # backslash only works on Windows.
    target = os.path.join(folder, name + '.jpg')
    with open(target, 'wb') as f:
        f.write(imageData)
def getImages():
    """Save the image of every record found in the global ``pageData``.

    ``pattern`` is applied to ``pageData`` in DOTALL mode; each match is a
    tuple of the pattern's three captured groups and is handed to
    ``saveImage`` in the order found. An exception from ``saveImage``
    propagates and stops processing of the remaining matches.
    """
    matcher = re.compile(pattern, re.S)
    for record in matcher.findall(pageData):
        saveImage(record)
# Crawl every listing page in the hard-coded range, one page per iteration.
# `count` is a module-level global because getData() reads it to build the URL.
count = 3937
while count <= 4320:
    print(count)
    try:
        getData()    # fetch the listing page for the current `count`
        getImages()  # parse it and save each image into its location folder
    except Exception as ex:
        # Best effort: report the failure and continue with the next page.
        # Print the concrete exception class; printing the bare `Exception`
        # name made every failure look the same.
        print('%s : %s' % (type(ex).__name__, ex))
    count += 1
    # NOTE: there is no delay between pages; a randomized 1-2 second pause
    # was sketched here in the original script but left disabled.