PyMuPDF 提取 PDF 表格的速度非常快,相比其他库是更好的选择。
行列较多的表格打印成 PDF 后,会因页宽和分页被拆成多个子表格;提取出的多个子表格需要先合并为一个表格,再处理数据。
下面的代码中,merge 函数用于合并按行、按列拆分的表格,addOneAxis0 函数用于合并仅因分页(按行)拆分出的多个子表。
def addOneAxis0(data, one):
    """Append the rows of table ``one`` below the accumulated table ``data``.

    Used for tables that were split only by page breaks, where every piece
    repeats the same title row.

    Args:
        data: Rows accumulated so far (a list of rows); empty on the first call.
        one: The next table piece; its first row is treated as the title row.

    Returns:
        A new list of rows. On the first call it is a shallow copy of ``one``
        (title row included); afterwards it is ``data`` followed by the rows
        of ``one`` without its title row.
    """
    if not data:
        # First piece: keep its title row. Copy so the result does not alias
        # the caller's list.
        return list(one)
    # Later pieces: drop the repeated title row.
    return data + one[1:]
-
def merge(data):
    """Merge the pieces of one large table into a single table.

    Each element of ``data`` is a table (a list of rows, each row a list of
    cells) whose first row is a title row and whose first column is a label
    column. Pieces are processed in order and classified by the first title
    after the label column:

    * it is one of the first piece's titles -> the piece holds further rows
      for the original columns; its body rows are appended;
    * it is already in the merged title row (a column added earlier) -> the
      piece holds further columns for the most recently appended rows; its
      body cells are appended to the last rows, title row skipped;
    * otherwise -> the piece holds new columns including their titles; all
      of its rows (title row included) are appended to the last rows.

    NOTE(review): this presumably expects pieces ordered left-to-right first,
    then top-to-bottom -- confirm against the PDF layout.

    Args:
        data: List of table pieces in document order.

    Returns:
        A new merged table. The input pieces are not modified. An empty
        ``data`` yields an empty list.
    """
    if not data:
        return []

    # Copy every row so that extending rows below never mutates the input.
    output = [list(row) for row in data[0]]
    # Titles of the first piece, without the label column. This is a fixed
    # snapshot: titles added later are only visible through output[0].
    titles = data[0][0][1:]

    for table in data[1:]:
        newtitle = table[0][1:]
        if newtitle[0] in titles:
            # Repeated title row: more rows for the original columns.
            output.extend(list(row) for row in table[1:])
            continue

        if newtitle[0] in output[0]:
            # Columns whose titles were merged earlier: skip the title row.
            addition = table[1:]
        else:
            # Brand-new columns: the title row is merged as well.
            addition = table

        if not addition:
            # Nothing to add; also avoids output[-0:] selecting every row.
            continue

        # Align the new cells with the last len(addition) merged rows and
        # drop the repeated label column from each added row.
        for row, extra in zip(output[-len(addition):], addition):
            row.extend(extra[1:])
    return output
def getDataMass(file_name):
    """Extract and merge the trailing tables of a PDF document.

    Pages are scanned from the last page backwards until a page without any
    table is found. Only the first table found on each page is used. Tables
    whose top-left cell is "溶液标签" are combined with ``merge``; all other
    tables are stacked with ``addOneAxis0``.

    Side effects:
        Sets the module-level ``doc`` to the opened document (it is left
        open) and ``curvePage`` to the index of the last page examined: the
        page on which the backward scan stopped, ``0`` if every page held a
        table, or ``None`` if the document has no pages.

    Args:
        file_name: Path of the PDF file passed to ``fitz.open``.

    Returns:
        A ``(data, mass)`` tuple: the merged "溶液标签" table (an empty list
        when no such table exists) and the stacked remaining tables.
    """
    global doc, curvePage
    doc = fitz.open(file_name)  # kept open on purpose via the global `doc`

    tables = []
    i = None  # stays None when the document has no pages
    for i in reversed(range(doc.page_count)):
        tabs = doc[i].find_tables()
        if not tabs.tables:
            # First page (from the end) without a table ends the scan.
            break
        tables.append(tabs[0].extract())
    tables.reverse()  # back to document order

    data = []
    mass = []
    for one in tables:
        # Guard against an empty extraction before reading the top-left cell.
        if one and one[0] and one[0][0] == "溶液标签":
            data.append(one)
        else:
            mass = addOneAxis0(mass, one)
    curvePage = i

    # merge() needs at least one piece; return an empty table otherwise.
    data = merge(data) if data else []
    return (data, mass)
'运行
