# qqzone_crawler/crawler_old.py
# 2020-12-02 22:45:37 +08:00
#
# 53 lines
# 1.3 KiB
# Python

import urllib.request
import re
import os
import urllib
import json
# Build the new URL
def makeNewUrl(url):
    """Rewrite a photo URL onto the ``r.photo.store.qq.com`` host.

    Two pieces are extracted from ``url``:

    * the text between the first ``com/`` and the last following ``/m``
    * the text between the first ``/m/`` and the last following ``null``

    and recombined as ``http://r.photo.store.qq.com/<first>/r/<second>``.

    Args:
        url: The source URL string to rewrite.

    Returns:
        The rewritten URL string, or ``False`` when either piece cannot be
        found in ``url`` (callers test the result with ``if not ...``).
    """
    # re.search yields the same first match that findall(...)[0] would,
    # without building a list of every match.
    prefix_match = re.search(r'com/(.*)/m', url)
    suffix_match = re.search(r'/m/(.*)null', url)
    if prefix_match is None or suffix_match is None:
        return False
    return "http://r.photo.store.qq.com/{0}/r/{1}".format(
        prefix_match.group(1), suffix_match.group(1)
    )
def fun(blocknum, blocksize, totalsize):
    """Print download progress; has the signature of a ``urlretrieve`` reporthook.

    blocknum: index of the current block
    blocksize: size of each transferred block
    totalsize: total size of the remote file; ``urlretrieve`` may pass a
        non-positive value when the server does not report a size
    """
    if totalsize <= 0:
        # A percentage is undefined here: dividing would raise
        # ZeroDivisionError for 0 and give a negative figure for -1, so
        # report the amount transferred so far instead.
        print("download : %d bytes" % (blocknum * blocksize))
        return
    # The last block usually overshoots the file size, so cap at 100%.
    percent = min(blocknum * blocksize / totalsize, 1.0) * 100
    print("download : %.2f%%" % (percent))
def start():
    """Download the images listed in ``./imgs1.json`` through ``./imgs6.json``.

    For each ``i`` from 1 to 6 the JSON file ``./imgs<i>.json`` is loaded and
    iterated (presumably a list of image URL strings -- confirm against the
    producer of these files). Each non-empty entry is rewritten with
    ``makeNewUrl`` and fetched with ``urllib.request.urlretrieve`` into the
    hard-coded ``img<i>`` folder on drive D:, named ``<index>.jpg``, with
    ``fun`` as the progress hook.

    ``index`` advances only after a successful download call, so empty or
    unparseable entries do not leave gaps in the saved file numbering.
    """
    for i in range(1, 7):
        print('%s个文件夹开始' % (i))
        map_url = './imgs' + str(i) + '.json'
        storage_url = 'D:\\Code\\python\\qqzone\\img' + str(i) + '\\'
        # Read the JSON file; the with-block closes the handle even if
        # parsing fails (the handle used to be leaked on every iteration).
        with open(map_url) as fp:
            data = json.load(fp)
        # Counter used both for progress messages and output file names.
        index = 1
        for img_url in data:
            print('%s个图片开始' % (index))
            if not img_url:
                print('空地址')
                continue
            new_url = makeNewUrl(img_url)
            if not new_url:
                print("地址提取失败")
                continue
            print('开始请求')
            urllib.request.urlretrieve(
                new_url, '{0}{1}.jpg'.format(storage_url, index), fun
            )
            index += 1
# Script entry point: the crawl starts as soon as this file is executed (or imported).
start()