Summer---------0714 web scraping


#0716------------
# write a CSV file
'''
import csv

csvFile = open("csv_test", "w+")
try:
    writer = csv.writer(csvFile)
    writer.writerow(('number', 'number plus 2', 'number times 2'))
    for i in range(10):
        writer.writerow((i, i + 2, i * 2))
finally:
    csvFile.close()
'''

# scrape an HTML table into a CSV file
'''
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("https://www.runoob.com/html/html-tables.html")
bs = BeautifulSoup(html, "html.parser")
table = bs.findAll('table', {'class': 'reference'})[0]
rows = table.findAll('tr')

csvFile = open("csv_test", "wt+")
writer = csv.writer(csvFile)
try:
    for row in rows:
        csvRow = []
        for cell in row.findAll(['td', 'th']):
            csvRow.append(cell.get_text())
        writer.writerow(csvRow)
finally:
    csvFile.close()
'''

# use scraping ---- create the connection and run the SQL statements
'''
import pymysql, random, datetime, re
from urllib.request import urlopen
from bs4 import BeautifulSoup

# open the connection
conn = pymysql.connect(
    host="127.0.0.1",
    unix_socket="/tmp/mysql.sock",
    user="root",
    password="root",
    db="gly",
    charset="utf8"   # pymysql expects "utf8", not "utf-8"
)
cur = conn.cursor()          # create a cursor
cur.execute("USE scraping")  # switch to the scraping database

random.seed(datetime.datetime.now())

# store one page
def store(title, content):
    cur.execute("INSERT INTO pages (title, content) VALUES (%s, %s)", (title, content))
    cur.connection.commit()

# scrape the data
def getLinks(articleUrl):
    html = urlopen("http://en.wikipedia.org" + articleUrl)
    bs0bj = BeautifulSoup(html, "html.parser")
    title = bs0bj.find("h1").get_text()
    content = bs0bj.find("div", {"id": "mw-content-text"}).find("p").get_text()
    store(title, content)
    return bs0bj.find("div", {"id": "bodyContent"}).findAll("a", href=re.compile("^(/wiki/)((?!:).)*$"))

links = getLinks("/wiki/Kevin_Bacon")
try:
    while len(links) > 0:
        newArticle = links[random.randint(0, len(links) - 1)].attrs["href"]
        print(newArticle)
        links = getLinks(newArticle)
finally:
    cur.close()
    conn.close()
'''
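# store() above assumes a "pages" table already exists in the scraping database.
# A minimal sketch of a schema that would satisfy it (the column sizes and the
# created timestamp are assumptions, not taken from these notes):
'''
import pymysql

conn = pymysql.connect(host="127.0.0.1", unix_socket="/tmp/mysql.sock",
                       user="root", password="root", charset="utf8")
cur = conn.cursor()
cur.execute("CREATE DATABASE IF NOT EXISTS scraping")
cur.execute("USE scraping")
cur.execute("""
    CREATE TABLE IF NOT EXISTS pages (
        id INT NOT NULL AUTO_INCREMENT,
        title VARCHAR(200),
        content TEXT,
        created TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
        PRIMARY KEY (id)
    )
""")
cur.close()
conn.close()
'''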
# send mail via QQ SMTP
'''
import smtplib
from email.mime.text import MIMEText

def send_mail(username, passwd, recv, content):
    mailserver = "smtp.qq.com"
    username_send = username
    password = passwd            # for QQ mail this must be an SMTP authorization code
    username_recv = recv
    mail = MIMEText(content, _subtype='html', _charset='utf-8')
    mail['Subject'] = '工资条'   # "pay slip"
    mail['From'] = username_send
    mail['To'] = username_recv
    smtp = smtplib.SMTP(mailserver, port=587)
    smtp.starttls()              # port 587 requires STARTTLS before login
    smtp.login(username_send, password)
    smtp.sendmail(username_send, username_recv, mail.as_string())
    smtp.quit()
    print("successful")

send_mail("756347780@qq.com", "15045491532q", "756347780@qq.com", "你好")
'''

# send mail via 163 SMTP
'''
import smtplib
from email.mime.text import MIMEText

def send_mail(username, passwd, recv, content):
    mailserver = "smtp.163.com"      # mail server address
    username_send = username         # account username
    password = passwd                # password: must be the SMTP authorization code
    username_recv = recv             # recipients; separate several with commas
    mail = MIMEText(content, _subtype='html', _charset='utf-8')
    mail['Subject'] = '工资条'       # "pay slip"
    mail['From'] = username_send     # sender
    mail['To'] = username_recv       # recipient
    smtp = smtplib.SMTP(mailserver, port=25)   # connect to the server; plain SMTP uses port 25
    smtp.login(username_send, password)        # log in
    smtp.sendmail(username_send, username_recv, mail.as_string())  # sender, recipient, message serialized to a string
    smtp.quit()                      # close the session when done
    print('sent successfully')

send_mail("15045491532@163.com", "333355wo", "15045491532@163.com", "你好")
'''

# read plain text
'''
from urllib.request import urlopen

textPage = urlopen("http://www.pythonscraping.com/pages/warandpeace/chapter1-ru.txt")
print(str(textPage.read(), 'utf-8'))
'''

# read a remote CSV
'''
from urllib.request import urlopen
from io import StringIO
import csv

data = urlopen("http://www.pythonscraping.com/files/MontyPythonAlbums.csv") \
    .read().decode("ascii", "ignore")
dataFile = StringIO(data)
csvReader = csv.reader(dataFile)
for row in csvReader:
    print(row)
'''

# strip the header row with DictReader
'''
from urllib.request import urlopen
from io import StringIO
import csv

data = urlopen("http://www.pythonscraping.com/files/MontyPythonAlbums.csv") \
    .read().decode("ascii", "ignore")
dataFile = StringIO(data)
csvReader = csv.DictReader(dataFile)
print(csvReader.fieldnames)
print("---------------")
for row in csvReader:
    print(row)
'''

# read PDF ---- pdfminer (pdfminer3k)
'''
from urllib.request import urlopen
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from io import StringIO

def readPDF(pdfFile):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, laparams=laparams)
    process_pdf(rsrcmgr, device, pdfFile)
    device.close()
    content = retstr.getvalue()
    retstr.close()
    return content

pdfFile = urlopen("http://www.pythonscraping.com/pages/warandpeace/chapter1.pdf")
outputString = readPDF(pdfFile)
print(outputString)
pdfFile.close()
'''

# read Word .docx
'''
from zipfile import ZipFile
from urllib.request import urlopen
from io import BytesIO

# fetch the document
wordFile = urlopen("http://www.pythonscraping.com/pages/AWordDocument.docx").read()
wordFile = BytesIO(wordFile)          # wrap the raw bytes in a file-like object
document = ZipFile(wordFile)          # a .docx file is a zip archive
xml_content = document.read('word/document.xml')  # the body text lives in this entry
print(xml_content.decode("utf-8"))
'''

# parse it with BeautifulSoup
# w:t ========== the WordprocessingML tag that holds the visible text
'''
from zipfile import ZipFile
from urllib.request import urlopen
from io import BytesIO
from bs4 import BeautifulSoup

wordFile = urlopen("http://www.pythonscraping.com/pages/AWordDocument.docx").read()
wordFile = BytesIO(wordFile)
document = ZipFile(wordFile)
xml_content = document.read('word/document.xml')

wordObj = BeautifulSoup(xml_content.decode("utf-8"), 'xml')   # the 'xml' parser needs lxml
textStrings = wordObj.findAll("w:t")
for textElem in textStrings:
    print(textElem.text)
'''
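# If lxml is not installed, the 'xml' parser above is unavailable. A sketch of a
# standard-library alternative (not from the original notes): pull the same w:t
# nodes out with xml.etree.ElementTree and the WordprocessingML namespace.
'''
from zipfile import ZipFile
from urllib.request import urlopen
from io import BytesIO
import xml.etree.ElementTree as ET

# the standard namespace that prefixes every w: tag in document.xml
W_NS = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"

wordFile = urlopen("http://www.pythonscraping.com/pages/AWordDocument.docx").read()
document = ZipFile(BytesIO(wordFile))
root = ET.fromstring(document.read('word/document.xml'))

# iter() walks the whole tree; each w:t element holds a run of visible text
for textElem in root.iter(W_NS + "t"):
    print(textElem.text)
'''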
