1
#
coding:utf-8
2
3
"""
4
5
Created on 2015-9-17
6
7
8
9
@author: huangxie
10
11
"""
12
13
import time,math,os,re,urllib,urllib2,cookielib
14
15
from bs4
import BeautifulSoup
16
17
import time
18
19
import re
20
21
import uuid
22
23
import json
24
25
from threading
import Thread
26
27
from Queue
import Queue
28
29
import MySQLdb as mdb
30
31
import sys
32
33
import threading
34
35
import utils
36
37
import imitate_browser
38
39
from MySQLdb.constants.REFRESH
import STATUS
40
reload(sys)  # Python 2 hack: re-expose setdefaultencoding(), hidden by site.py
sys.setdefaultencoding('utf-8')  # force utf-8 default codec (discouraged, but code below relies on it)

# MySQL credentials; database name 'sosogif' is passed at connect time in __init__.
DB_HOST = '127.0.0.1'
DB_USER = 'root'
DB_PASS = 'root'

# Initial HTTP proxy; get_pic() rotates this from the `proxy` DB table at runtime.
proxy = {u'http': u'222.39.64.13:8118'}

# Baidu image-search JSON endpoint: word = query, pn = page offset, rn = page size.
TOP_URL = "http://image.baidu.com/i?tn=resultjsonavatarnew&ie=utf-8&word={word}&pn={pn}&rn={rn}"
# Baidu web-search page; scraped by generateSeed() for related-keyword suggestions.
KEYWORD_URL = "https://www.baidu.com/s?ie=utf-8&f=8&tn=baidu&wd={wd}"

"""
i_headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
             'Accept':'json;q=0.9,*/*;q=0.8',
             'Accept-Charset':'utf-8;q=0.7,*;q=0.3',
             'Accept-Encoding':'gzip',
             'Connection':'close',
             'Referer':None # NOTE: if fetching still fails, set this to the target site's host
             }
"""
# Request headers used for downloads; minimal User-Agent spoof.
i_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.48'}
80
81
82
83
def GetDateString():
    """Return today's date as 'YYYY-M-D' (no zero padding), e.g. '2015-9-17'.

    Used by BaiduImage.makeDateFolder() to name the per-day download folder.
    """
    x = time.localtime(time.time())
    # Plain attribute access replaces the original x.__getattribute__("tm_year")
    # reflection; %d formatting keeps the original unpadded month/day.
    return "%d-%d-%d" % (x.tm_year, x.tm_mon, x.tm_mday)
91
92
93
class BaiduImage(threading.Thread):
    """Multi-threaded Baidu image crawler: keywords and downloaded-image
    metadata live in a local MySQL database ('sosogif'); images are saved
    under E://sosogif in per-day folders."""

    def __init__(self):
        Thread.__init__(self)
        self.browser = imitate_browser.BrowserBase()  # project-local urllib2 wrapper
        self.chance = 0    # download-error counter; >0 triggers a proxy switch in get_pic()
        self.chance1 = 0   # parse/IO-error counter; >1 triggers a proxy switch
        self.request_queue = Queue()   # (id, word, page_num) crawl tasks
        self.wait_ana_queue = Queue()  # NOTE(review): never consumed in this file
        # self.key_word_queue.put((("动态图", 0, 24)))
        self.count = 0  # total images saved; used to shard output folders
        self.mutex = threading.RLock()  # re-entrant lock so one thread can re-acquire a lock it already holds
        self.commit_count = 0  # inserts since last commit; flushed every 10 (see anaylis_info)
        self.ID = 500          # current row id in the `proxy` table
        self.next_proxy_set = set()  # (protocol, "ip:port") candidates for rotation
        self.dbconn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'sosogif', charset='utf8')
        self.dbconn.autocommit(False)  # commits are batched manually
        self.dbcurr = self.dbconn.cursor()
        self.dbcurr.execute('SET NAMES utf8')
130
131
132
133
"""
134
135
def run(self):
136
137
while True:
138
139
self.get_pic()
140
141
"""
142
143
144
145
    def work(self, item):
        """Thread body: forever alternate crawling (get_pic) and refilling
        the task queue (prepare_request).

        item -- numeric thread index, used only in the startup log line.
        """
        print "start thread", item
        while True:  # original note: wait when MAX_REQUEST or more entries are pending
            self.get_pic()
            self.prepare_request()
154
155
156
157
def format_keyword_url(self,keyword):
158
159
160
161
return KEYWORD_URL.format(wd=keyword).encode(
'
utf-8
')
162
163
164
165
    def generateSeed(self, url):
        """Scrape the 'related searches' table (div#rs) of a Baidu result
        page and insert any keyword containing '动态图' or 'gif' into the
        `info` table as a new status=0 crawl seed (if not already present).
        """
        html = self.browser.openurl(url).read()
        if html:
            try:
                soup = BeautifulSoup(html)
                trs = soup.find('div', id='rs').find('table').find_all('tr')  # all suggestion rows
                for tr in trs:
                    ths = tr.find_all('th')
                    for th in ths:
                        a = th.find_all('a')[0]
                        keyword = a.text.strip()
                        if "动态图" in keyword or "gif" in keyword:
                            print "keyword", keyword
                            self.dbcurr.execute('select id from info where word=%s', (keyword))
                            y = self.dbcurr.fetchone()
                            if not y:
                                self.dbcurr.execute('INSERT INTO info(word,status,page_num,left_num,how_many) VALUES(%s,0,0,0,0)', (keyword))
                                self.dbconn.commit()
            # NOTE(review): bare except hides both scraping and DB errors —
            # worth narrowing to (AttributeError, mdb.Error).
            except:
                pass
206
207
208
209
210
211
def prepare_request(self):
212
213 self.lock()
214
215 self.dbcurr.execute(
'
select * from info where status=0
')
216
217 result = self.dbcurr.fetchone()
218
219
if result:
220
221 id,word,status,page_num,left_num,how_many=result
222
223 self.request_queue.put((id,word,page_num))
224
225
if page_num==0
and left_num==0
and how_many==0:
226
227 url=self.format_keyword_url(word)
228
229 self.generateSeed(url)
230
231 html=
""
232
233
try:
234
235 url=self.format_top_url(word, page_num, 24)
236
237 html = self.browser.openurl(url).read()
238
239
except Exception as err:
240
241
print
"
err
",err
242
243
#
pass
244
245
if html!=
"":
246
247 how_many=self.how_many(html)
248
249
print
"
how_many
",how_many
250
251
if how_many==None:
252
253 how_many=0
254
255 t=math.ceil(how_many/24*100)
#
只要前1/100即可
256
257 num = int(t)
258
259
for i
in xrange(0,num-1):
260
261 self.dbcurr.execute(
'
INSERT INTO info(word,status,page_num,left_num,how_many) VALUES(%s,%s,%s,%s,%s)
',(word,0,i*24,num-i,how_many))
262
263 self.dbcurr.execute(
'
update info SET status=1 WHERE id=%s
',(id))
#
置为已经访问
264
265 self.dbconn.commit()
266
267 self.unlock()
268
269
270
271
272
273
def start_work(self,req_max):
274
275
for item
in xrange(req_max):
276
277 t = threading.Thread(target=self.work, args=(item,))
278
279 t.setDaemon(True)
280
281 t.start()
282
283
284
285
    def lock(self):  # acquire
        """Acquire the shared re-entrant mutex guarding DB access."""
        self.mutex.acquire()
288
289
290
291
    def unlock(self):  # release
        """Release the mutex taken by lock()."""
        self.mutex.release()
294
295
296
297
def get_para(self,url,key):
298
299 values = url.split(
'
?
')[-1]
300
301
for key_value
in values.split(
'
&
'):
302
303 value=key_value.split(
'
=
')
304
305
if value[0]==key:
306
307
return value[1]
308
309
return None
310
311
312
313
def makeDateFolder( self,par,child):
314
315
#
self.lock()
316
317
if os.path.isdir( par ):
318
319 path=par +
'
//
' + GetDateString()
320
321 newFolderName = path+
'
//
'+child
322
323
if
not os.path.isdir(path):
324
325 os.mkdir(path)
326
327
if
not os.path.isdir( newFolderName ):
328
329 os.mkdir( newFolderName )
330
331
return newFolderName
332
333
else:
334
335
return par
336
337
#
self.unlock()
338
339
340
341
    def parse_json(self, data):
        """Parse one page of Baidu image-search JSON and download each image.

        For every entry in data['imgs']: skip objURLs already present in
        `pic_info`, otherwise download through the current proxy, save the
        file under E://sosogif, and record its metadata via anaylis_info().
        """
        ipdata = json.loads(data)
        try:
            if ipdata['imgs']:
                for n in ipdata['imgs']:  # one dict per image result
                    if n['objURL']:
                        try:
                            proxy_support = urllib2.ProxyHandler(proxy)
                            opener = urllib2.build_opener(proxy_support)
                            urllib2.install_opener(opener)
                            #print "proxy",proxy
                            self.lock()
                            # NOTE(review): if urlopen below raises IOError the
                            # outer `except IOError` path never calls unlock(),
                            # leaving the mutex held — TODO confirm and fix.
                            self.dbcurr.execute('select ID from pic_info where objURL=%s', (n['objURL']))
                            y = self.dbcurr.fetchone()
                            #print "y=",y
                            if y:
                                print "database exist"
                                self.unlock()  # release before `continue`
                                continue
                            else:
                                real_extension = utils.get_extension(n['objURL'])
                                req = urllib2.Request(n['objURL'], headers=i_headers)
                                resp = urllib2.urlopen(req, None, 5)  # 5s timeout
                                dataimg = resp.read()
                                name = str(uuid.uuid1())  # collision-free file name
                                filename = ""
                                if len(real_extension) > 4:
                                    real_extension = ".gif"  # implausible extension: assume gif
                                real_extension = real_extension.lower()
                                # Shard output dirs: 60 buckets for gifs, 20 for the rest.
                                if real_extension == ".gif":
                                    filename = self.makeDateFolder("E://sosogif", "d" + str(self.count % 60)) + "//" + name + "-www.sosogif.com-搜搜gif贡献" + real_extension
                                    self.count += 1
                                else:
                                    filename = self.makeDateFolder("E://sosogif", "o" + str(self.count % 20)) + "//" + name + "-www.sosogif.com-搜搜gif贡献" + real_extension
                                    self.count += 1
                                """
                                name=str(uuid.uuid1())
                                filename=""
                                if len(real_extension)>4:
                                    real_extension=".gif"
                                filename =self.makeDateFolder("E://sosogif", "d"+str(self.count % 60))+"//"+name+"-www.sosogif.com-搜搜gif贡献"+real_extension
                                self.count+=1
                                """
                                try:
                                    if not os.path.exists(filename):
                                        file_object = open(filename, 'w+b')
                                        file_object.write(dataimg)
                                        file_object.close()
                                        self.anaylis_info(n, filename, real_extension)  # record metadata in DB
                                    else:
                                        print "file exist"
                                except IOError, e1:
                                    print "e1=", e1
                                    pass
                                self.unlock()
                        except IOError, e2:
                            #print "e2=",e2
                            pass
                            self.chance1 += 1  # counts toward a proxy switch in get_pic()
        except Exception as parse_error:
            print "parse_error", parse_error
            pass
466
467
468
469
def title_dealwith(self,title):
470
471
472
473
#
print "title",title
474
475 a=title.find(
"
<strong>
")
476
477 temp1=title[0:a]
478
479 b=title.find(
"
</strong>
")
480
481 temp2=title[a+8:b]
482
483 temp3=title[b+9:len(title)]
484
485
return (temp1+temp2+temp3).strip()
486
487
488
489
def anaylis_info(self,n,filename,real_extension):
490
491
print
"
success.
"
492
493
494
495
#
if self.wait_ana_queue.qsize()!=0:
496
497
#
n,filename,real_extension=self.wait.ana_queue.get()
498
499
#
self.lock()
500
501 objURL=n[
'
objURL
']
#
图片地址
502
503 fromURLHost=n[
'
fromURLHost
']
#
来源网站
504
505 width=n[
'
width
']
#
宽度
506
507 height=n[
'
height
']
#
高度
508
509 di=n[
'
di
']
#
用来唯一标识
510
511 type=n[
'
type
']
#
格式
512
513 fromPageTitle=n[
'
fromPageTitle
']
#
来自网站
514
515 keyword=self.title_dealwith(fromPageTitle)
516
517 cs=n[
'
cs
']
#
未知
518
519 os=n[
'
os
']
#
未知
520
521 temp = time.time()
522
523 x = time.localtime(float(temp))
524
525 acTime = time.strftime(
"
%Y-%m-%d %H:%M:%S
",x)
#
爬取时间
526
527 self.dbcurr.execute(
'
select ID from pic_info where cs=%s
', (cs))
528
529 y = self.dbcurr.fetchone()
530
531
if
not y:
532
533
print
'
add pic
',filename
534
535 self.commit_count+=1
536
537 self.dbcurr.execute(
'
INSERT INTO pic_info(objURL,fromURLHost,width,height,di,type,keyword,cs,os,acTime,filename,real_extension) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
',(objURL,fromURLHost,width,height,di,type,keyword,cs,os,acTime,filename,real_extension))
538
539
if self.commit_count==10:
540
541 self.dbconn.commit()
542
543 self.commit_count=0
544
545
#
self.unlock()
546
547
548
549
550
551
def format_top_url(self,word,pn,rn):
552
553
554
555 url = TOP_URL.format(word=word, pn=pn,rn=rn).encode(
'
utf-8
')
556
557
return url
558
559
560
561
def how_many(self,data):
562
563
try:
564
565 ipdata = json.loads(data)
566
567
if ipdata[
'
displayNum
']>0:
568
569 how_many=ipdata[
'
displayNum
']
570
571
return int(how_many)
572
573
else:
574
575
return 0
576
577
except Exception as e:
578
579
pass
580
581
582
583
    def get_pic(self):
        """
        word="gif"
        pn=0
        rn=24
        if self.key_word_queue.qsize()!=0:
            word,pn,rn=self.key_word_queue.get()
        url=self.format_top_url(word,pn,rn)
        global proxy
        if url:
            try:
                html=""
                try:
                    req = urllib2.Request(url,headers=i_headers)
                    response = urllib2.urlopen(req, None,5)
                    #print "url",url
                    html = self.browser.openurl(url).read()
                except Exception as err:
                    print "err",err
                    #pass
                if html:
                    how_many=self.how_many(html)
                    #how_many=10000
                    print "how_many",how_many
                    word=self.get_para(url,"word")
                    rn=int(self.get_para(url,"rn"))
                    t=math.ceil(how_many/rn)
                    num = int(t)
                    for item in xrange(0,num-1):
        """
        # NOTE: the string above is dead, commented-out code retained from the
        # original author; the live implementation starts here.  Pops one
        # (id, word, page_num) task, marks the row visited, rotates the global
        # proxy when the error counters trip, fetches the result page, and
        # hands the JSON to parse_json().
        try:
            global proxy
            print "size of queue", self.request_queue.qsize()
            if self.request_queue.qsize() != 0:
                id, word, page_num = self.request_queue.get()
                u = self.format_top_url(word, page_num, 24)
                self.lock()
                self.dbcurr.execute('update info SET status=1 WHERE id=%s', (id))
                self.dbconn.commit()
                if self.chance > 0 or self.chance1 > 1:  # either failure counter trips a proxy change
                    if self.ID % 100 == 0:
                        # wrap around once past the end of the proxy table
                        self.dbcurr.execute("select count(*) from proxy")
                        for r in self.dbcurr:
                            count = r[0]
                        if self.ID > count:
                            self.ID = 50
                    self.dbcurr.execute("select * from proxy where ID=%s", (self.ID))
                    results = self.dbcurr.fetchall()
                    for r in results:
                        protocol = r[1]
                        ip = r[2]
                        port = r[3]
                        pro = (protocol, ip + ":" + port)
                        if pro not in self.next_proxy_set:
                            self.next_proxy_set.add(pro)
                    self.chance = 0
                    self.chance1 = 0
                    self.ID += 1
                self.unlock()
                proxy_support = urllib2.ProxyHandler(proxy)
                opener = urllib2.build_opener(proxy_support)
                urllib2.install_opener(opener)
                html = ""
                try:
                    req = urllib2.Request(u, headers=i_headers)
                    #print "u=",u
                    response = urllib2.urlopen(req, None, 5)  # 5s timeout
                    html = response.read()
                    if html:
                        #print "html",type(html)
                        self.parse_json(html)
                except Exception as ex1:
                    #print "error=",ex1
                    pass
                    self.chance += 1  # failed fetch: schedule a proxy change
                    if self.chance > 0 or self.chance1 > 1:
                        if len(self.next_proxy_set) > 0:
                            # swap the module-level proxy used by all threads
                            protocol, socket = self.next_proxy_set.pop()
                            proxy = {protocol: socket}
                            print "change proxy finished<<", proxy, self.ID
        except Exception as e:
            print "error1", e
            pass
748
749
750
751
if __name__ == '__main__':
    app = BaiduImage()
    app.start_work(80)  # 80 crawler threads
    #app.generateSeed()
    # Keep the main thread alive so the daemon workers keep running.
    # Fix: sleep instead of the original `while 1: pass`, which pinned a
    # CPU core in a busy-wait.
    while True:
        time.sleep(1)
# Originally published at: https://www.cnblogs.com/jym-sunshine/p/5476900.html
# Related resource: batch-downloading Baidu image-library pictures with a Python crawler