百度图片爬虫-python版

it2024-12-24  15

  1  # coding:utf-8   2    3  """   4    5  Created on 2015-9-17   6    7      8    9  @author: huangxie  10   11  """  12   13  import time,math,os,re,urllib,urllib2,cookielib   14   15  from bs4  import BeautifulSoup  16   17  import time    18   19  import re  20   21  import uuid  22   23  import json  24   25  from threading  import Thread  26   27  from Queue  import Queue   28   29  import MySQLdb as mdb  30   31  import sys  32   33  import threading  34   35  import utils  36   37  import imitate_browser  38   39  from MySQLdb.constants.REFRESH  import STATUS  40   41 reload(sys)  42   43 sys.setdefaultencoding( ' utf-8 ')  44   45    46   47 DB_HOST =  ' 127.0.0.1 '  48   49 DB_USER =  ' root '  50   51 DB_PASS =  ' root '  52   53 proxy = {u ' http ':u ' 222.39.64.13:8118 '}  54   55 TOP_URL= " http://image.baidu.com/i?tn=resultjsonavatarnew&ie=utf-8&word={word}&pn={pn}&rn={rn} "  56   57 KEYWORD_URL= " https://www.baidu.com/s?ie=utf-8&f=8&tn=baidu&wd={wd} "  58   59    60   61  """  62   63  i_headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',  64   65                'Accept':'json;q=0.9,*/*;q=0.8',  66   67                'Accept-Charset':'utf-8;q=0.7,*;q=0.3',  68   69                'Accept-Encoding':'gzip',  70   71                'Connection':'close',  72   73                'Referer':None #注意如果依然不能抓取的话,这里可以设置抓取网站的host  74   75              }  76   77  """  78   79 i_headers = { ' User-Agent ': ' Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.48 '}  80   81    82   83  def GetDateString():  84   85     x = time.localtime(time.time())  86   87     foldername = str(x. __getattribute__( " tm_year "))+ " - "+str(x. __getattribute__( " tm_mon "))+ " - "+str(x. __getattribute__( " tm_mday "))  88   89      return foldername   90   91    92   93  class BaiduImage(threading.Thread):       94   95    96   97      def  __init__(self):  98   99         Thread. __init__(self) 100  101         self.browser=imitate_browser.BrowserBase() 102  103         self.chance=0 104  105         self.chance1=0 106  107         self.request_queue=Queue() 108  109         self.wait_ana_queue=Queue() 110  111          # self.key_word_queue.put((("动态图", 0, 24))) 112  113         self.count=0 114  115         self.mutex = threading.RLock()  # 可重入锁,使单线程可以再次获得已经获得的锁 116  117         self.commit_count=0 118  119         self.ID=500 120  121         self.next_proxy_set = set() 122  123         self.dbconn = mdb.connect(DB_HOST, DB_USER, DB_PASS,  ' sosogif ', charset= ' utf8 ') 124  125         self.dbconn.autocommit(False) 126  127         self.dbcurr = self.dbconn.cursor() 128  129         self.dbcurr.execute( ' SET NAMES utf8 ') 130  131          132  133      """ 134  135      def run(self): 136  137          while True: 138  139              self.get_pic() 140  141       """ 142  143      144  145      def work(self,item): 146  147          print  " start thread ",item 148  149          while True:  # MAX_REQUEST条以上则等待 150  151             self.get_pic() 152  153             self.prepare_request() 154  155      156  157      def format_keyword_url(self,keyword): 158  159    160  161          return KEYWORD_URL.format(wd=keyword).encode( ' utf-8 '162  163             164  165      def generateSeed(self,url): 166  167          168  169         html = self.browser.openurl(url).read() 170  171          if html: 172  173              try: 174  175                 soup = BeautifulSoup(html) 176  177                 trs = soup.find( ' div ', id= ' rs ').find( ' table ').find_all( ' tr '# 获得所有行 178  179                  for tr  in trs: 180  181                     ths=tr.find_all( ' th ') 182  183                      for th  in ths: 184  185                         a=th.find_all( ' a ')[0] 186  187                         keyword=a.text.strip() 188  189                          if  " 动态图 "  in keyword  or  " gif "  in keyword: 190  191                              print  " keyword ",keyword 192  193                             self.dbcurr.execute( ' select id from info where word=%s ',(keyword)) 194  195                             y = self.dbcurr.fetchone() 196  197                              if  not y: 198  199                                 self.dbcurr.execute( ' INSERT INTO info(word,status,page_num,left_num,how_many) VALUES(%s,0,0,0,0) ',(keyword)) 200  201                     self.dbconn.commit() 202  203              except: 204  205                  pass 206  207                  208  209                 210  211      def prepare_request(self): 212  213         self.lock() 214  215         self.dbcurr.execute( ' select * from info where status=0 ') 216  217         result = self.dbcurr.fetchone() 218  219          if result: 220  221             id,word,status,page_num,left_num,how_many=result 222  223             self.request_queue.put((id,word,page_num))  224  225              if page_num==0  and left_num==0  and how_many==0: 226  227                 url=self.format_keyword_url(word) 228  229                 self.generateSeed(url) 230  231                 html= "" 232  233                  try: 234  235                     url=self.format_top_url(word, page_num, 24) 236  237                     html = self.browser.openurl(url).read() 238  239                  except Exception as err: 240  241                      print  " err ",err 242  243                      # pass 244  245                  if html!= "": 246  247                     how_many=self.how_many(html) 248  249                      print  " how_many ",how_many 250  251                      if how_many==None: 252  253                         how_many=0 254  255                     t=math.ceil(how_many/24*100)  # 只要前1/100即可 256  257                     num = int(t) 258  259                      for i   in xrange(0,num-1): 260  261                         self.dbcurr.execute( ' INSERT INTO info(word,status,page_num,left_num,how_many) VALUES(%s,%s,%s,%s,%s) ',(word,0,i*24,num-i,how_many)) 262  263                     self.dbcurr.execute( ' update info SET status=1 WHERE id=%s ',(id))  # 置为已经访问 264  265                     self.dbconn.commit() 266  267         self.unlock() 268  269                  270  271              272  273      def start_work(self,req_max): 274  275          for item  in xrange(req_max): 276  277             t = threading.Thread(target=self.work, args=(item,)) 278  279             t.setDaemon(True) 280  281             t.start() 282  283              284  285      def lock(self):  # 加锁 286  287         self.mutex.acquire() 288  289   290  291      def unlock(self):  # 解锁 292  293         self.mutex.release() 294  295   296  297      def get_para(self,url,key): 298  299         values = url.split( ' ? ')[-1] 300  301          for key_value  in values.split( ' & '): 302  303             value=key_value.split( ' = ') 304  305              if value[0]==key: 306  307                  return value[1] 308  309          return None   310  311      312  313      def makeDateFolder( self,par,child): 314  315          # self.lock() 316  317          if os.path.isdir( par ): 318  319             path=par +  ' // ' + GetDateString() 320  321             newFolderName = path+ ' // '+child 322  323              if  not os.path.isdir(path): 324  325                 os.mkdir(path) 326  327              if  not os.path.isdir( newFolderName ): 328  329                 os.mkdir( newFolderName ) 330  331              return newFolderName 332  333          else: 334  335              return par  336  337          # self.unlock() 338  339          340  341      def parse_json(self,data): 342  343          344  345         ipdata = json.loads(data) 346  347          try: 348  349              if ipdata[ ' imgs ']:   350  351                  for n  in ipdata[ ' imgs ']:  # data子项  352  353                      if n[ ' objURL ']:   354  355                          try: 356  357                             proxy_support = urllib2.ProxyHandler(proxy) 358  359                             opener = urllib2.build_opener(proxy_support) 360  361                             urllib2.install_opener(opener) 362  363                              # print "proxy",proxy 364  365                             self.lock() 366  367                             self.dbcurr.execute( ' select ID from pic_info where objURL=%s ', (n[ ' objURL '])) 368  369                             y = self.dbcurr.fetchone() 370  371                              # print "y=",y 372  373                              if y: 374  375                                  print  " database exist " 376  377                                 self.unlock()  # continue 前解锁 378  379                                  continue 380  381                              else: 382  383                                 real_extension=utils.get_extension(n[ ' objURL ']) 384  385                                 req = urllib2.Request(n[ ' objURL '],headers=i_headers) 386  387                                 resp = urllib2.urlopen(req,None,5) 388  389                                 dataimg=resp.read() 390  391                                 name=str(uuid.uuid1()) 392  393                                 filename= "" 394  395                                  if len(real_extension)>4: 396  397                                     real_extension= " .gif " 398  399                                 real_extension=real_extension.lower() 400  401                                  if real_extension== " .gif ": 402  403                                     filename  =self.makeDateFolder( " E://sosogif "" d "+str(self.count % 60))+ " // "+name+ " -www.sosogif.com-搜搜gif贡献 "+real_extension 404  405                                     self.count+=1 406  407                                  else: 408  409                                     filename  =self.makeDateFolder( " E://sosogif "" o "+str(self.count % 20))+ " // "+name+ " -www.sosogif.com-搜搜gif贡献 "+real_extension 410  411                                     self.count+=1 412  413                                  """ 414  415                                  name=str(uuid.uuid1()) 416  417                                  filename="" 418  419                                  if len(real_extension)>4: 420  421                                      real_extension=".gif" 422  423                                  filename  =self.makeDateFolder("E://sosogif", "d"+str(self.count % 60))+"//"+name+"-www.sosogif.com-搜搜gif贡献"+real_extension 424  425                                  self.count+=1  426  427                                   """ 428  429                                  try430  431                                      if  not os.path.exists(filename):  432  433                                         file_object = open(filename, ' w+b ')   434  435                                         file_object.write(dataimg)   436  437                                         file_object.close() 438  439                                         self.anaylis_info(n,filename,real_extension)  # 入库操作 440  441                                      else: 442  443                                          print  " file exist "  444  445                                  except IOError,e1:   446  447                                      print  " e1= ",e1 448  449                                      pass 450  451                             self.unlock() 452  453                          except IOError,e2:   454  455                              # print "e2=",e2  456  457                              pass   458  459                             self.chance1+=1 460  461          except Exception as parse_error: 462  463              print  " parse_error ",parse_error 464  465              pass 466  467      468  469      def title_dealwith(self,title): 470  471          472  473          # print "title",title 474  475         a=title.find( " <strong> ") 476  477         temp1=title[0:a] 478  479         b=title.find( " </strong> ") 480  481         temp2=title[a+8:b] 482  483         temp3=title[b+9:len(title)] 484  485          return (temp1+temp2+temp3).strip() 486  487          488  489      def anaylis_info(self,n,filename,real_extension): 490  491          print  " success. " 492  493          494  495          # if self.wait_ana_queue.qsize()!=0: 496  497              # n,filename,real_extension=self.wait.ana_queue.get() 498  499          # self.lock() 500  501         objURL=n[ ' objURL '# 图片地址 502  503         fromURLHost=n[ ' fromURLHost '# 来源网站 504  505         width=n[ ' width ']   # 宽度 506  507         height=n[ ' height '# 高度 508  509         di=n[ ' di '# 用来唯一标识 510  511         type=n[ ' type '# 格式 512  513         fromPageTitle=n[ ' fromPageTitle '# 来自网站 514  515         keyword=self.title_dealwith(fromPageTitle) 516  517         cs=n[ ' cs '# 未知 518  519         os=n[ ' os '# 未知 520  521         temp = time.time() 522  523         x = time.localtime(float(temp)) 524  525         acTime = time.strftime( " %Y-%m-%d %H:%M:%S ",x)  # 爬取时间 526  527         self.dbcurr.execute( ' select ID from pic_info where cs=%s ', (cs)) 528  529         y = self.dbcurr.fetchone() 530  531          if  not y: 532  533              print  ' add pic ',filename 534  535             self.commit_count+=1 536  537             self.dbcurr.execute( ' INSERT INTO pic_info(objURL,fromURLHost,width,height,di,type,keyword,cs,os,acTime,filename,real_extension) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ',(objURL,fromURLHost,width,height,di,type,keyword,cs,os,acTime,filename,real_extension)) 538  539              if self.commit_count==10: 540  541                 self.dbconn.commit() 542  543                 self.commit_count=0 544  545          # self.unlock() 546  547             548  549   550  551      def format_top_url(self,word,pn,rn): 552  553   554  555         url = TOP_URL.format(word=word, pn=pn,rn=rn).encode( ' utf-8 '556  557          return url 558  559   560  561      def how_many(self,data): 562  563          try: 564  565             ipdata = json.loads(data) 566  567              if ipdata[ ' displayNum ']>0: 568  569                 how_many=ipdata[ ' displayNum '] 570  571                  return int(how_many) 572  573              else: 574  575                  return 0 576  577          except Exception as e: 578  579              pass 580  581          582  583      def get_pic(self): 584  585          """ 586  587          word="gif" 588  589          pn=0 590  591          rn=24 592  593          if self.key_word_queue.qsize()!=0: 594  595              word,pn,rn=self.key_word_queue.get() 596  597          url=self.format_top_url(word,pn,rn) 598  599          global proxy 600  601          if url: 602  603              try: 604  605                  html="" 606  607                  try: 608  609                      req = urllib2.Request(url,headers=i_headers) 610  611                      response = urllib2.urlopen(req, None,5) 612  613                      #print "url",url 614  615                      html = self.browser.openurl(url).read() 616  617                  except Exception as err: 618  619                      print "err",err 620  621                      #pass 622  623                  if html: 624  625                      how_many=self.how_many(html) 626  627                      #how_many=10000 628  629                      print "how_many",how_many 630  631                      word=self.get_para(url,"word") 632  633                      rn=int(self.get_para(url,"rn")) 634  635                      t=math.ceil(how_many/rn) 636  637                      num = int(t) 638  639                      for item  in xrange(0,num-1): 640  641           """ 642  643          try: 644  645              global proxy 646  647              print  " size of queue ",self.request_queue.qsize() 648  649              if self.request_queue.qsize()!=0: 650  651                 id,word,page_num = self.request_queue.get()             652  653                 u=self.format_top_url(word,page_num,24) 654  655                 self.lock() 656  657                 self.dbcurr.execute( ' update info SET status=1 WHERE id=%s ',(id)) 658  659                 self.dbconn.commit() 660  661                  if self.chance >0  or self.chance1>1:  # 任何一个出问题都给换代理 662  663                      if self.ID % 100==0: 664  665                         self.dbcurr.execute( " select count(*) from proxy ") 666  667                          for r  in self.dbcurr: 668  669                             count=r[0] 670  671                          if self.ID>count: 672  673                             self.ID=50 674  675                     self.dbcurr.execute( " select * from proxy where ID=%s ",(self.ID)) 676  677                     results = self.dbcurr.fetchall() 678  679                      for r  in results: 680  681                         protocol=r[1] 682  683                         ip=r[2] 684  685                         port=r[3] 686  687                         pro=(protocol,ip+ " : "+port) 688  689                          if pro  not  in self.next_proxy_set: 690  691                             self.next_proxy_set.add(pro) 692  693                     self.chance=0 694  695                     self.chance1=0 696  697                     self.ID+=1 698  699                 self.unlock()  700  701                 proxy_support = urllib2.ProxyHandler(proxy) 702  703                 opener = urllib2.build_opener(proxy_support) 704  705                 urllib2.install_opener(opener) 706  707                 html= "" 708  709                  try: 710  711                     req = urllib2.Request(u,headers=i_headers) 712  713                      # print "u=",u 714  715                     response = urllib2.urlopen(req, None,5) 716  717                     html = response.read() 718  719                      if html: 720  721                          # print "html",type(html) 722  723                         self.parse_json(html) 724  725                  except Exception as ex1: 726  727                      # print "error=",ex1 728  729                      pass 730  731                     self.chance+=1 732  733                      if self.chance>0  or self.chance1>1: 734  735                          if len(self.next_proxy_set)>0: 736  737                             protocol,socket=self.next_proxy_set.pop() 738  739                             proxy= {protocol:socket} 740  741                              print  " change proxy finished<< ",proxy,self.ID 742  743          except Exception as e: 744  745              print  " error1 ",e 746  747              pass 748  749              750  751  if  __name__ ==  ' __main__ ': 752  753   754  755     app = BaiduImage()  756  757     app.start_work(80) 758  759      # app.generateSeed() 760  761      while 1: 762  763          pass

 

 

 

 

 

 

 

 

转载于:https://www.cnblogs.com/jym-sunshine/p/5476900.html

相关资源:利用Python爬虫批量下载百度图库图片
最新回复(0)