img_crawler/index.py
2020-12-02 22:31:54 +08:00

104 lines
3.2 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
__author__ = 'Peng'
from bs4 import BeautifulSoup,Comment
import urllib.request.Request
from urllib.request import urlopen,HTTPError
import json
import datetime
import logging
import sys
import re
import time
import random
import ConfigParser
import threading
import threading
import Queue
import time
from time import ctime,sleep
#获取infinity所有壁纸
# 配置日志信息 输出到控制台
logging.basicConfig(level=logging.INFO,
format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
datefmt='%a, %d %b %Y %H:%M:%S',
stream=sys.stdout)
def getInfinity(i):
#创建请求头
headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0",
"Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Language":"zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
"Accept-Encoding":"gzip, deflate, br",
"Host":"img.infinitynewtab.com",
"Upgrade-Insecure-Requests":1}
try:
url="https://img.infinitynewtab.com/wallpaper/"+str(i)+".jpg"
request = urllib.Request(url,headers=headers)
html = urlopen(request)
data = html.read()
imgName = str(i)+".jpg"
f = open("D:\\infinity2\\"+imgName, 'wb')
f.write(data)
print (u"正在保存的图片为",imgName)
f.close()
except HTTPError as e:
print(e)
class worker(threading.Thread):
def __init__(self,id, name):
threading.Thread.__init__(self)
self.id = id
self.name = name
self.thread_stop = False
def run(self):
while not self.thread_stop:
print("thread%d %s: waiting for task" %(self.ident,self.name))
getInfinity(self.id)
self.thread_stop = True
def stop(self):
self.thread_stop = True
def createThread(id,Tnum):
threadList = []
for i in range(1,Tnum):
threadList.append(str(i))
threads = []
# 创建新线程
for tName in threadList:
thread = worker(id, tName)
thread.start()
threads.append(thread)
id += 1
# 等待所有线程完成
# join()方法的位置是在for循环外的也就是说必须等待for循环里的两个进程都结束后才去执行主进程。
#join的作用是在子线程完成运行之前这个子线程的父线程将一直被阻塞。
for t in threads:
t.join()
# sleep(0.5)
def startDownload(l,Tnum):
for i in range(1,l,Tnum):
createThread(i,Tnum)
startDownload(4050,100)
#01
#这并不是我想要的多线程高并发目前只是一次起了100个进程但是起完100个进程之后还是要等待大约3到5秒
#再发起下一次请求,我想要的是它一直在发请求,一直不断的在读写下载。
#02
#在01思考的基础上我注释掉join()方法就实现了我的目标
#03
#02似乎存在连接丢失的情况有join时能够保证下载所有的目标图片没有join大概有1/4的连接丢失
#所以还是加上join等1批线程执行完再走第二批比较好