# -*- coding: utf-8 -*-
# author: zhl
# from robot.api import logger
# logger.info('提示信息打印', True,True)
# logger.error('错误信息打印', True)
import re
from html.parser import HTMLParser

import pandas as pd
class _RowCollector(HTMLParser):
    """Collect the text of ``<td>`` cells, grouped into one list per ``<tr>``.

    A finished row is not stored when ``</tr>`` is seen; it is stored by the
    first text node that follows the ``</tr>`` (usually the newline between
    rows).  Every further text node seen before the next ``<tr>`` stores an
    additional (empty) row.  This timing is kept on purpose because the
    trailing-row count used by ``html_tr_td`` was tuned against it.
    """

    def __init__(self):
        super().__init__()
        self.in_row = False      # between <tr> and </tr>
        self.in_cell = False     # between <td> and </td>
        self.row_closed = False  # a </tr> was seen and no <tr> since
        self.rows = []           # completed rows, each a list of cell texts
        self.current = []        # cells of the row being read

    def handle_starttag(self, tag, attrs):
        if tag == 'tr':
            self.in_row = True
            self.row_closed = False
        if tag == 'td':
            self.in_cell = True

    def handle_endtag(self, tag):
        if tag == 'tr':
            self.in_row = False
            self.row_closed = True
        if tag == 'td':
            self.in_cell = False

    def handle_data(self, data):
        # Only non-empty text that sits inside a <td> inside a <tr> is a cell.
        if self.in_cell and self.in_row and data:
            self.current.append(data)
        # Text after a closed row flushes whatever has been collected so far.
        if self.row_closed:
            self.rows.append(self.current)
            self.current = []


def html_tr_td(data, *, output_path="final.xls", header_skip=3, row_skip=4,
               footer_rows=8):
    """Extract table rows from an HTML report and save them as a spreadsheet.

    The first parsed row is treated as the header and loses its first
    ``header_skip`` cells.  Each following row loses its first ``row_skip``
    cells.  The last ``footer_rows`` parsed rows are ignored, and so is any
    row in which a cell contains the text "条记录".

    Args:
        data: HTML source as a string.
        output_path: Excel file to write (no index, no header line).  Pass
            ``None`` to skip writing and only get the DataFrame back.
        header_skip: Number of leading cells dropped from the header row.
        row_skip: Number of leading cells dropped from every data row.
        footer_rows: Number of trailing parsed rows that are not data.

    Returns:
        The ``pandas.DataFrame`` holding the header row followed by the kept
        data rows.

    Raises:
        IndexError: If the parser produced no rows at all.
    """
    # NOTE(review): the offsets 3 / 4 / 8 and the "<td></td>" literals were
    # reconstructed from a copy of this script that had lost its punctuation;
    # confirm them against the real report layout.

    # Give empty cells a placeholder so they are not dropped by the parser,
    # which would shift the remaining cells of that row to the left.
    data = data.replace("<td></td>", "<td>NULL</td>")

    parser = _RowCollector()
    parser.feed(data)
    parser.close()
    rows = parser.rows
    if not rows:
        raise IndexError("no table rows were parsed from the HTML input")

    selected = [rows[0][header_skip:]]
    print(len(rows))

    # max(1, ...) keeps the slice empty when there are fewer rows than
    # footer_rows, instead of letting a negative stop index wrap around.
    stop = max(1, len(rows) - footer_rows)
    for row in rows[1:stop]:
        is_summary = False
        for cell in row:
            # "条记录" appears in record-count lines (presumably a pagination
            # summary such as "N records"); such rows are not data.
            if re.search("条记录", cell):
                print('here')
                is_summary = True
        if not is_summary:
            selected.append(row[row_skip:])

    df = pd.DataFrame(selected)
    if output_path is not None:
        # NOTE(review): recent pandas releases may no longer provide a writer
        # for ".xls"; pass an ".xlsx" path if the default fails - confirm
        # against the installed pandas version.
        df.to_excel(output_path, index=False, header=False)
    return df
if __name__ == '__main__':
    import sys

    # Report to convert: first command-line argument if given, otherwise the
    # original hard-coded location.
    # NOTE(review): the path separators below were reconstructed - confirm.
    html_path = sys.argv[1] if len(sys.argv) > 1 else (
        r"C:\Program Files (x86)\Suning RobotStudio\Projects\test\test.html"
    )
    # The file is decoded as GBK; ``with`` closes it even if reading fails.
    with open(html_path, 'r', encoding="gbk") as ff:
        data = ff.read()
    html_tr_td(data)
# Reposted from: https://www.cnblogs.com/jessitommy/p/11077210.html
# Related resource: htmlToExcel.js