百度图片爬虫（Python 版）

it2024-12-24 49

1 # coding:utf-8 2 3 """ 4 5 Created on 2015-9-17 6 7 8 9 @author: huangxie 10 11 """ 12 13 import time,math,os,re,urllib,urllib2,cookielib 14 15 from bs4 import BeautifulSoup 16 17 import time 18 19 import re 20 21 import uuid 22 23 import json 24 25 from threading import Thread 26 27 from Queue import Queue 28 29 import MySQLdb as mdb 30 31 import sys 32 33 import threading 34 35 import utils 36 37 import imitate_browser 38 39 from MySQLdb.constants.REFRESH import STATUS 40 41 reload(sys) 42 43 sys.setdefaultencoding( ' utf-8 ') 44 45 46 47 DB_HOST = ' 127.0.0.1 ' 48 49 DB_USER = ' root ' 50 51 DB_PASS = ' root ' 52 53 proxy = {u ' http ':u ' 222.39.64.13:8118 '} 54 55 TOP_URL= " http://image.baidu.com/i?tn=resultjsonavatarnew&ie=utf-8&word={word}&pn={pn}&rn={rn} " 56 57 KEYWORD_URL= " https://www.baidu.com/s?ie=utf-8&f=8&tn=baidu&wd={wd} " 58 59 60 61 """ 62 63 i_headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11', 64 65 'Accept':'json;q=0.9,*/*;q=0.8', 66 67 'Accept-Charset':'utf-8;q=0.7,*;q=0.3', 68 69 'Accept-Encoding':'gzip', 70 71 'Connection':'close', 72 73 'Referer':None #注意如果依然不能抓取的话，这里可以设置抓取网站的host 74 75 } 76 77 """ 78 79 i_headers = { ' User-Agent ': ' Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.48 '} 80 81 82 83 def GetDateString(): 84 85 x = time.localtime(time.time()) 86 87 foldername = str(x. __getattribute__( " tm_year "))+ " - "+str(x. __getattribute__( " tm_mon "))+ " - "+str(x. __getattribute__( " tm_mday ")) 88 89 return foldername 90 91 92 93 class BaiduImage(threading.Thread): 94 95 96 97 def __init__(self): 98 99 Thread. 
__init__(self) 100 101 self.browser=imitate_browser.BrowserBase() 102 103 self.chance=0 104 105 self.chance1=0 106 107 self.request_queue=Queue() 108 109 self.wait_ana_queue=Queue() 110 111 # self.key_word_queue.put((("动态图", 0, 24))) 112 113 self.count=0 114 115 self.mutex = threading.RLock() # 可重入锁，使单线程可以再次获得已经获得的锁 116 117 self.commit_count=0 118 119 self.ID=500 120 121 self.next_proxy_set = set() 122 123 self.dbconn = mdb.connect(DB_HOST, DB_USER, DB_PASS, ' sosogif ', charset= ' utf8 ') 124 125 self.dbconn.autocommit(False) 126 127 self.dbcurr = self.dbconn.cursor() 128 129 self.dbcurr.execute( ' SET NAMES utf8 ') 130 131 132 133 """ 134 135 def run(self): 136 137 while True: 138 139 self.get_pic() 140 141 """ 142 143 144 145 def work(self,item): 146 147 print " start thread ",item 148 149 while True: # MAX_REQUEST条以上则等待 150 151 self.get_pic() 152 153 self.prepare_request() 154 155 156 157 def format_keyword_url(self,keyword): 158 159 160 161 return KEYWORD_URL.format(wd=keyword).encode( ' utf-8 ') 162 163 164 165 def generateSeed(self,url): 166 167 168 169 html = self.browser.openurl(url).read() 170 171 if html: 172 173 try: 174 175 soup = BeautifulSoup(html) 176 177 trs = soup.find( ' div ', id= ' rs ').find( ' table ').find_all( ' tr ') # 获得所有行 178 179 for tr in trs: 180 181 ths=tr.find_all( ' th ') 182 183 for th in ths: 184 185 a=th.find_all( ' a ')[0] 186 187 keyword=a.text.strip() 188 189 if " 动态图 " in keyword or " gif " in keyword: 190 191 print " keyword ",keyword 192 193 self.dbcurr.execute( ' select id from info where word=%s ',(keyword)) 194 195 y = self.dbcurr.fetchone() 196 197 if not y: 198 199 self.dbcurr.execute( ' INSERT INTO info(word,status,page_num,left_num,how_many) VALUES(%s,0,0,0,0) ',(keyword)) 200 201 self.dbconn.commit() 202 203 except: 204 205 pass 206 207 208 209 210 211 def prepare_request(self): 212 213 self.lock() 214 215 self.dbcurr.execute( ' select * from info where status=0 ') 216 217 result = self.dbcurr.fetchone() 218 219 if 
result: 220 221 id,word,status,page_num,left_num,how_many=result 222 223 self.request_queue.put((id,word,page_num)) 224 225 if page_num==0 and left_num==0 and how_many==0: 226 227 url=self.format_keyword_url(word) 228 229 self.generateSeed(url) 230 231 html= "" 232 233 try: 234 235 url=self.format_top_url(word, page_num, 24) 236 237 html = self.browser.openurl(url).read() 238 239 except Exception as err: 240 241 print " err ",err 242 243 # pass 244 245 if html!= "": 246 247 how_many=self.how_many(html) 248 249 print " how_many ",how_many 250 251 if how_many==None: 252 253 how_many=0 254 255 t=math.ceil(how_many/24*100) # 只要前1/100即可 256 257 num = int(t) 258 259 for i in xrange(0,num-1): 260 261 self.dbcurr.execute( ' INSERT INTO info(word,status,page_num,left_num,how_many) VALUES(%s,%s,%s,%s,%s) ',(word,0,i*24,num-i,how_many)) 262 263 self.dbcurr.execute( ' update info SET status=1 WHERE id=%s ',(id)) # 置为已经访问 264 265 self.dbconn.commit() 266 267 self.unlock() 268 269 270 271 272 273 def start_work(self,req_max): 274 275 for item in xrange(req_max): 276 277 t = threading.Thread(target=self.work, args=(item,)) 278 279 t.setDaemon(True) 280 281 t.start() 282 283 284 285 def lock(self): # 加锁 286 287 self.mutex.acquire() 288 289 290 291 def unlock(self): # 解锁 292 293 self.mutex.release() 294 295 296 297 def get_para(self,url,key): 298 299 values = url.split( ' ? 
')[-1] 300 301 for key_value in values.split( ' & '): 302 303 value=key_value.split( ' = ') 304 305 if value[0]==key: 306 307 return value[1] 308 309 return None 310 311 312 313 def makeDateFolder( self,par,child): 314 315 # self.lock() 316 317 if os.path.isdir( par ): 318 319 path=par + ' // ' + GetDateString() 320 321 newFolderName = path+ ' // '+child 322 323 if not os.path.isdir(path): 324 325 os.mkdir(path) 326 327 if not os.path.isdir( newFolderName ): 328 329 os.mkdir( newFolderName ) 330 331 return newFolderName 332 333 else: 334 335 return par 336 337 # self.unlock() 338 339 340 341 def parse_json(self,data): 342 343 344 345 ipdata = json.loads(data) 346 347 try: 348 349 if ipdata[ ' imgs ']: 350 351 for n in ipdata[ ' imgs ']: # data子项 352 353 if n[ ' objURL ']: 354 355 try: 356 357 proxy_support = urllib2.ProxyHandler(proxy) 358 359 opener = urllib2.build_opener(proxy_support) 360 361 urllib2.install_opener(opener) 362 363 # print "proxy",proxy 364 365 self.lock() 366 367 self.dbcurr.execute( ' select ID from pic_info where objURL=%s ', (n[ ' objURL '])) 368 369 y = self.dbcurr.fetchone() 370 371 # print "y=",y 372 373 if y: 374 375 print " database exist " 376 377 self.unlock() # continue 前解锁 378 379 continue 380 381 else: 382 383 real_extension=utils.get_extension(n[ ' objURL ']) 384 385 req = urllib2.Request(n[ ' objURL '],headers=i_headers) 386 387 resp = urllib2.urlopen(req,None,5) 388 389 dataimg=resp.read() 390 391 name=str(uuid.uuid1()) 392 393 filename= "" 394 395 if len(real_extension)>4: 396 397 real_extension= " .gif " 398 399 real_extension=real_extension.lower() 400 401 if real_extension== " .gif ": 402 403 filename =self.makeDateFolder( " E://sosogif ", " d "+str(self.count % 60))+ " // "+name+ " -www.sosogif.com-搜搜gif贡献 "+real_extension 404 405 self.count+=1 406 407 else: 408 409 filename =self.makeDateFolder( " E://sosogif ", " o "+str(self.count % 20))+ " // "+name+ " -www.sosogif.com-搜搜gif贡献 "+real_extension 410 411 self.count+=1 412 
413 """ 414 415 name=str(uuid.uuid1()) 416 417 filename="" 418 419 if len(real_extension)>4: 420 421 real_extension=".gif" 422 423 filename =self.makeDateFolder("E://sosogif", "d"+str(self.count % 60))+"//"+name+"-www.sosogif.com-搜搜gif贡献"+real_extension 424 425 self.count+=1 426 427 """ 428 429 try: 430 431 if not os.path.exists(filename): 432 433 file_object = open(filename, ' w+b ') 434 435 file_object.write(dataimg) 436 437 file_object.close() 438 439 self.anaylis_info(n,filename,real_extension) # 入库操作 440 441 else: 442 443 print " file exist " 444 445 except IOError,e1: 446 447 print " e1= ",e1 448 449 pass 450 451 self.unlock() 452 453 except IOError,e2: 454 455 # print "e2=",e2 456 457 pass 458 459 self.chance1+=1 460 461 except Exception as parse_error: 462 463 print " parse_error ",parse_error 464 465 pass 466 467 468 469 def title_dealwith(self,title): 470 471 472 473 # print "title",title 474 475 a=title.find( " <strong> ") 476 477 temp1=title[0:a] 478 479 b=title.find( " </strong> ") 480 481 temp2=title[a+8:b] 482 483 temp3=title[b+9:len(title)] 484 485 return (temp1+temp2+temp3).strip() 486 487 488 489 def anaylis_info(self,n,filename,real_extension): 490 491 print " success. 
" 492 493 494 495 # if self.wait_ana_queue.qsize()!=0: 496 497 # n,filename,real_extension=self.wait.ana_queue.get() 498 499 # self.lock() 500 501 objURL=n[ ' objURL '] # 图片地址 502 503 fromURLHost=n[ ' fromURLHost '] # 来源网站 504 505 width=n[ ' width '] # 宽度 506 507 height=n[ ' height '] # 高度 508 509 di=n[ ' di '] # 用来唯一标识 510 511 type=n[ ' type '] # 格式 512 513 fromPageTitle=n[ ' fromPageTitle '] # 来自网站 514 515 keyword=self.title_dealwith(fromPageTitle) 516 517 cs=n[ ' cs '] # 未知 518 519 os=n[ ' os '] # 未知 520 521 temp = time.time() 522 523 x = time.localtime(float(temp)) 524 525 acTime = time.strftime( " %Y-%m-%d %H:%M:%S ",x) # 爬取时间 526 527 self.dbcurr.execute( ' select ID from pic_info where cs=%s ', (cs)) 528 529 y = self.dbcurr.fetchone() 530 531 if not y: 532 533 print ' add pic ',filename 534 535 self.commit_count+=1 536 537 self.dbcurr.execute( ' INSERT INTO pic_info(objURL,fromURLHost,width,height,di,type,keyword,cs,os,acTime,filename,real_extension) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ',(objURL,fromURLHost,width,height,di,type,keyword,cs,os,acTime,filename,real_extension)) 538 539 if self.commit_count==10: 540 541 self.dbconn.commit() 542 543 self.commit_count=0 544 545 # self.unlock() 546 547 548 549 550 551 def format_top_url(self,word,pn,rn): 552 553 554 555 url = TOP_URL.format(word=word, pn=pn,rn=rn).encode( ' utf-8 ') 556 557 return url 558 559 560 561 def how_many(self,data): 562 563 try: 564 565 ipdata = json.loads(data) 566 567 if ipdata[ ' displayNum ']>0: 568 569 how_many=ipdata[ ' displayNum '] 570 571 return int(how_many) 572 573 else: 574 575 return 0 576 577 except Exception as e: 578 579 pass 580 581 582 583 def get_pic(self): 584 585 """ 586 587 word="gif" 588 589 pn=0 590 591 rn=24 592 593 if self.key_word_queue.qsize()!=0: 594 595 word,pn,rn=self.key_word_queue.get() 596 597 url=self.format_top_url(word,pn,rn) 598 599 global proxy 600 601 if url: 602 603 try: 604 605 html="" 606 607 try: 608 609 req = 
urllib2.Request(url,headers=i_headers) 610 611 response = urllib2.urlopen(req, None,5) 612 613 #print "url",url 614 615 html = self.browser.openurl(url).read() 616 617 except Exception as err: 618 619 print "err",err 620 621 #pass 622 623 if html: 624 625 how_many=self.how_many(html) 626 627 #how_many=10000 628 629 print "how_many",how_many 630 631 word=self.get_para(url,"word") 632 633 rn=int(self.get_para(url,"rn")) 634 635 t=math.ceil(how_many/rn) 636 637 num = int(t) 638 639 for item in xrange(0,num-1): 640 641 """ 642 643 try: 644 645 global proxy 646 647 print " size of queue ",self.request_queue.qsize() 648 649 if self.request_queue.qsize()!=0: 650 651 id,word,page_num = self.request_queue.get() 652 653 u=self.format_top_url(word,page_num,24) 654 655 self.lock() 656 657 self.dbcurr.execute( ' update info SET status=1 WHERE id=%s ',(id)) 658 659 self.dbconn.commit() 660 661 if self.chance >0 or self.chance1>1: # 任何一个出问题都给换代理 662 663 if self.ID % 100==0: 664 665 self.dbcurr.execute( " select count(*) from proxy ") 666 667 for r in self.dbcurr: 668 669 count=r[0] 670 671 if self.ID>count: 672 673 self.ID=50 674 675 self.dbcurr.execute( " select * from proxy where ID=%s ",(self.ID)) 676 677 results = self.dbcurr.fetchall() 678 679 for r in results: 680 681 protocol=r[1] 682 683 ip=r[2] 684 685 port=r[3] 686 687 pro=(protocol,ip+ " : "+port) 688 689 if pro not in self.next_proxy_set: 690 691 self.next_proxy_set.add(pro) 692 693 self.chance=0 694 695 self.chance1=0 696 697 self.ID+=1 698 699 self.unlock() 700 701 proxy_support = urllib2.ProxyHandler(proxy) 702 703 opener = urllib2.build_opener(proxy_support) 704 705 urllib2.install_opener(opener) 706 707 html= "" 708 709 try: 710 711 req = urllib2.Request(u,headers=i_headers) 712 713 # print "u=",u 714 715 response = urllib2.urlopen(req, None,5) 716 717 html = response.read() 718 719 if html: 720 721 # print "html",type(html) 722 723 self.parse_json(html) 724 725 except Exception as ex1: 726 727 # print 
"error=",ex1 728 729 pass 730 731 self.chance+=1 732 733 if self.chance>0 or self.chance1>1: 734 735 if len(self.next_proxy_set)>0: 736 737 protocol,socket=self.next_proxy_set.pop() 738 739 proxy= {protocol:socket} 740 741 print " change proxy finished<< ",proxy,self.ID 742 743 except Exception as e: 744 745 print " error1 ",e 746 747 pass 748 749 750 751 if __name__ == ' __main__ ': 752 753 754 755 app = BaiduImage() 756 757 app.start_work(80) 758 759 # app.generateSeed() 760 761 while 1: 762 763 pass

转载于:https://www.cnblogs.com/jym-sunshine/p/5476900.html

相关资源：利用Python爬虫批量下载百度图库图片

最新回复(0)