从一个 html 文件已经读取表格数据进入到 pd
我准备把 pd 数据写入到一个 html 文件的时候,报了个错误,遇上了困扰了很多人编码类型的问题,
error in main!! 'gbk' codec can't decode byte 0x94 in position 256: illegal multibyte sequence
SomeFunc.py 里面的 transformCodec()是网上找的一个例子,好像不能正常工作,我理解是 2.x 版本写的,3.x 不合适?
import requests
import pandas as pd
import string
import time
import datetime
import codecs
import SomeFunc
srcfile = 'C:/Users/Administrator/Desktop/src.html'
if __name__ == '__main__':
try:
f = codecs.open(srcfile, 'r+', 'utf-8')
readStr = f.read()
f.close()
PageDF = PageDF.append(pd.read_html(readStr)[7], ignore_index=True)
PageDF = PageDF.drop(PageDF.tail(1).index) #删除最后一行
#PageDF = SomeFunc.transformCodec(PageDF)
PageDF.to_html('C:/Users/Administrator/Desktop/pcResult/A.html')
except Exception as e:
print('error in main!!', e)
#############################
SomeFunc.py #
#############################
def transformCodec(re_data):
try:
re_data = re_data.decode('gbk')
except Exception as error:
print("error")
print('delete illegal string,try again...')
pos = re.findall(r'decodebytesinposition([\d]+)-([\d]+):illegal', str(error).replace(' ', ''))
if len(pos) == 1:
re_data = re_data[0:int(pos[0][0])] + re_data[int(pos[0][1]):]
re_data = transformCodec(re_data)
return re_data
return re_data