# img_crawler/cache.py

import urllib.request
import re
import os
import urllib
import json

def makeNewUrl(url):
    """Build a new image URL on the r.photo.store.qq.com host.

    Two pieces are extracted from ``url``: the text between ``com/`` and
    ``/m``, and the text between ``/m/`` and ``null``. Both captures are
    greedy, so each one extends to the last possible terminator.

    Args:
        url: the source URL string to rewrite.

    Returns:
        ``"http://r.photo.store.qq.com/<first>/r/<second>"`` when both
        pieces are found, otherwise ``False``.
    """
    head = re.search('com/(.*)/m', url)
    tail = re.search('/m/(.*)null', url)
    # Either pattern missing means the URL does not have the expected shape.
    if head is None or tail is None:
        return False
    return "http://r.photo.store.qq.com/" + head.group(1) + "/r/" + tail.group(1)

def fun(blocknum, blocksize, totalsize):
    """Print download progress; usable as a ``urlretrieve`` report hook.

    Args:
        blocknum: number of the current block.
        blocksize: size of each transferred block, in bytes.
        totalsize: total size of the file being fetched, in bytes. A value
            of zero or less is treated as "size unknown".

    When the total size is known, prints the completed percentage (capped
    at 100%). When it is not, prints the number of bytes transferred so
    far instead of dividing by a non-positive size.
    """
    transferred = blocknum * blocksize
    if totalsize <= 0:
        # No usable total: a percentage would raise ZeroDivisionError (0)
        # or come out negative (-1), so report raw bytes instead.
        print("download : %d bytes" % transferred)
        return
    # The final block usually overshoots the real size, hence the cap.
    percent = min(transferred / totalsize, 1.0) * 100
    print("download : %.2f%%" % percent)

# Read the JSON file; the context manager closes the handle as soon as
# parsing is done instead of leaving it open for the rest of the run.
with open('./imgs1.json') as fp:
    data = json.load(fp)
# Initialise the counter; it is also used as the output file name.
index = 1

for img_url in data:
    print('第%s个开始'%(index))
    # Stop once ten images have been downloaded.
    if index > 10:
        print("长度终止")
        break
    new_url = makeNewUrl(img_url)
    if not new_url:
        # The URL did not match the expected pattern; skip it without
        # advancing the counter so file numbers stay contiguous.
        print("地址提取失败")
        continue
    print('开始请求')
    # `fun` is passed as the report hook so progress is printed per block.
    urllib.request.urlretrieve(new_url, '{0}{1}.jpg'.format('D:\\Code\\imgCrawler\\mapDepot1\\', index), fun)
    index +=1