img_crawler/test.py
2020-12-02 22:31:54 +08:00

114 lines
3.3 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
__author__ = 'Peng'
# stdlib
import configparser  # Py3 rename of Py2 `ConfigParser`
import datetime
import json
import logging
import os
import queue  # Py3 rename of Py2 `Queue`
import random
import re
import sys
import threading
import time
from time import ctime, sleep
from urllib.error import HTTPError
from urllib.request import Request, urlopen
# third-party
from bs4 import BeautifulSoup, Comment
# Fetch every wallpaper from the Infinity new-tab image host.
# Configure root logging: INFO and above, timestamp + file/line prefix,
# emitted to stdout (the console).
logging.basicConfig(level=logging.INFO,
format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
datefmt='%a, %d %b %Y %H:%M:%S',
stream=sys.stdout)
def getInfinity(i, data):
    """Download image number *i* from the URL list *data* and save it
    under ./<i // 500>/<i % 500>.jpg (500 images per bucket folder).

    :param i: index into *data*; also determines the on-disk path.
    :param data: sequence of image URLs.

    HTTP errors are printed and swallowed so one bad URL does not
    abort the whole crawl.
    """
    try:
        url = data[i]
        # was `urllib.Request(url)` — no such attribute; use the
        # imported Request class.
        request = Request(url)
        with urlopen(request) as response:
            img_data = response.read()
        base_path = os.path.abspath('.')
        # was `index` (undefined NameError) — the bucket is derived
        # from the image index `i`.
        folder_path = os.path.join(base_path, str(i // 500))
        # make sure the bucket folder exists before writing into it
        os.makedirs(folder_path, exist_ok=True)
        file_path = os.path.join(folder_path, str(i % 500) + '.jpg')
        with open(file_path, 'wb') as f:
            f.write(img_data)
        # was `imgName` (undefined NameError); report the saved path
        print(u"正在保存的图片为", file_path)
    except HTTPError as e:
        print(e)
class worker(threading.Thread):
    """Thread that downloads exactly one image (index = its ``id``)."""

    def __init__(self, id, name, data):
        threading.Thread.__init__(self)
        self.id = id                # index into the URL list
        self.name = name            # thread label (Thread.name setter)
        self.thread_stop = False    # set True once the download is done
        self.data = data            # URL list shared by all workers

    def run(self):
        # The loop is vestigial: thread_stop is set after one pass, so
        # the body runs exactly once.
        while not self.thread_stop:
            print("thread%d %s: waiting for task" % (self.ident, self.name))
            # BUG FIX: was the module-level global `data`, which ignored
            # the list passed to the constructor.
            getInfinity(self.id, self.data)
            self.thread_stop = True

    def stop(self):
        self.thread_stop = True
def createThread(id, Tnum, data):
    """Spawn *Tnum* worker threads with consecutive image ids starting
    at *id*, then block until the whole batch has finished.

    :param id: first image index of this batch.
    :param Tnum: number of threads (= images) in the batch.
    :param data: URL list handed to every worker.
    """
    threads = []
    # BUG FIX: was `range(1, Tnum)`, which spawned only Tnum-1 threads —
    # startDownload strides by Tnum, so one image per batch was skipped.
    for n in range(Tnum):
        thread = worker(id, str(n + 1), data)
        thread.start()
        threads.append(thread)
        id += 1
    # join() outside the spawn loop: all threads of the batch run
    # concurrently, and the caller resumes only when every one is done
    # (each join blocks the parent until that child finishes).
    for t in threads:
        t.join()
def startDownload(l, Tnum, data):
    """Download entries of *data* in batches of *Tnum* threads.

    :param l: total number of entries (batch starts run 1, 1+Tnum, ...).
    :param Tnum: batch size handed to createThread.
    :param data: the URL list.
    """
    batch_starts = range(1, l, Tnum)
    for start in batch_starts:
        createThread(start, Tnum, data)
# Create the numbered bucket folders in the current directory.
def createFileFolder(data):
    """Pre-create the bucket folders (./1, ./2, ...) the downloader
    writes into — one folder per 500 entries of *data*.

    :param data: the full download list; only its length matters.
    """
    # current working directory is the download root
    base_path = os.path.abspath('.')
    # BUG FIX: the original set index = 1 and never incremented it, so
    # `index % 500 == 0` was never true and no folder was ever created.
    for index, _ in enumerate(data, start=1):
        if index % 500 == 0:
            path = os.path.join(base_path, str(index // 500))
            # exist_ok: re-running the script must not crash on mkdir
            os.makedirs(path, exist_ok=True)
# Load the list of wallpaper records scraped earlier.
with open('./allData.json') as fp:
    data = json.load(fp)
# Pre-create the bucket folders, then download in batches of 50 threads.
createFileFolder(data)
startDownload(len(data), 50, data)