借助pdfplumber解析
效果如下:

{
'发票号码(FPHM)': '24322000000011529984',
'开票日期(KPRQ)': '2024年01月11日',
'合计(HJ)': '1205.94',
'购方': '91320213586657279T',
'销方': '91320214MAD1N7EN36',
'价税合计(JSHJ)': '1218.00',
'项目(XM)-1': '餐饮 11205.940594059405 1205.94 1% 12.06'
}

1、安装

pip install pdfplumber -i https://pypi.tuna.tsinghua.edu.cn/simple

2、全部代码

def _search_labeled_field(label, value_pattern, text):
    """Return the value printed after ``label`` and a colon, or None.

    Both the ASCII colon and the full-width colon (U+FF1A) are accepted,
    optionally followed by spaces or tabs. Spaces inside the captured
    value are removed.
    """
    match = re.search(label + r'[:\uff1a][ \t]*(' + value_pattern + r')', text)
    if match is None:
        return None
    return match.group(1).replace(' ', '')


def _parse_invoice_text(text):
    """Build the invoice dict from the plain text of an invoice page."""
    invoice = {}

    # Header fields printed as "<label>:<value>" above the invoice table.
    labeled_fields = (
        ('发票号码', r'\d+', "发票号码(FPHM)"),
        ('开票日期', r'.*', "开票日期(KPRQ)"),
        ('机器编号', r'\d+', "机器编号(JQBH)"),
        ('发票代码', r'\d+', "发票代码(FPDM)"),
        ('校验码', r'\d+', "校验码(JYM)"),
    )
    for label, value_pattern, key in labeled_fields:
        value = _search_labeled_field(label, value_pattern, text)
        if value is not None:
            invoice[key] = value

    # Pre-tax total: the first number on the "合 计" line. The whitespace
    # between the two characters is required, so "价税合计" does not match.
    total_line = re.search(r'合\s+计(.*)', text)
    if total_line is not None:
        amount = re.search(r'\d+(?:\.\d+)?',
                           total_line.group(1).replace(' ', ''))
        if amount is not None:
            invoice["合计(HJ)"] = amount.group()

    # Buyer / seller taxpayer IDs.
    # Layout 1: each ID is preceded by an explicit "纳税人识别号:" label.
    labeled_ids = re.findall(r'纳税人识别号[:\uff1a][ \t]*(\w+)', text)
    if len(labeled_ids) >= 2:
        invoice["购方"] = labeled_ids[0]
        invoice["销方"] = labeled_ids[1]
    else:
        # Layout 2: only the bare 18-character codes are present.
        bare_ids = re.findall(
            r'[0-9A-HJ-NPQRTUWXY]{2}\d{6}[0-9A-HJ-NPQRTUWXY]{10}', text)
        if len(bare_ids) == 2:
            invoice["购方"] = bare_ids[0]
            invoice["销方"] = bare_ids[1]
        elif len(bare_ids) >= 3:
            # With three or more hits the first one is skipped
            # (presumably it is part of the long invoice number -- confirm
            # against real samples).
            invoice["购方"] = bare_ids[1]
            invoice["销方"] = bare_ids[2]

    # Total including tax: whatever follows "小写" and its closing paren
    # (ASCII or full-width), without spaces or a leading yen sign.
    grand_total = re.search(r'小写[)\uff09]?(.*)', text)
    if grand_total is not None:
        value = grand_total.group(1).replace(' ', '')
        invoice["价税合计(JSHJ)"] = value.lstrip(')\uff09\u00a5\uffe5')

    # Line items: the remainder of every line after a "*<category>*" marker.
    details = re.findall(r'\*[\u4e00-\u9fa5]+\*(.*)', text)
    for index, detail in enumerate(details, start=1):
        invoice["项目(XM)-" + str(index)] = detail

    return invoice


def getPdfText2(path):
    """Extract key fields from the first page of an invoice PDF.

    The page text is obtained with pdfplumber and scanned with regular
    expressions. Both the raw text and the resulting dict are printed,
    and the dict is also returned. The module must have ``re`` and
    ``pdfplumber`` imported.

    Args:
        path: Location of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        dict mapping field names to strings. Only fields that were found
        are present. Possible keys: "发票号码(FPHM)", "开票日期(KPRQ)",
        "机器编号(JQBH)", "发票代码(FPDM)", "校验码(JYM)", "合计(HJ)",
        "购方", "销方", "价税合计(JSHJ)" and "项目(XM)-<n>" (n from 1).
    """
    with pdfplumber.open(path) as pdf:
        # Fall back to an empty string so the regex calls below never
        # receive None when the page yields no text.
        text = pdf.pages[0].extract_text() or ''
    print(text)
    invoice = _parse_invoice_text(text)
    print(invoice)
    return invoice