当前位置 博文首页 > 李长孺的专栏:python3 pdf 无边框 表格 进程池多进程 批量提取

    李长孺的专栏:python3 pdf 无边框 表格 进程池多进程 批量提取

    作者:[db:作者] 时间:2021-08-17 09:47

    # -*- coding: utf-8 -*-
    """
    Created on Wed Oct 21 09:30:07 2020
    
    @author: 元白
    """
    import camelot
    import pandas
    import time
    import os
    import glob
    import re
    import queue
    import threading
    import time
    
    #from multiprocessing import Pool
    import multiprocessing as mp
     
    
    
    def func_pdf_find(pdf_path):
        """Return the paths of all ``*.pdf`` files directly inside *pdf_path*.

        The directory is not searched recursively; an empty list is returned
        when nothing matches.
        """
        pattern = f"{pdf_path}/*.pdf"
        return glob.glob(pattern)
    
    
    def get_table_form_pdf(pdf):
        """Extract tables from the PDF file *pdf* with camelot's stream flavor.

        Returns whatever ``camelot.read_pdf`` produces for pages 1 through 6.
        """
        # Only the first six pages of each PDF are scanned for tables.
        return camelot.read_pdf(pdf, flavor='stream', pages='1-6')
    
    def put_table_to_cvs(tables, pdf, csv_path='dfdemo.csv'):
        """Append three merged table pairs from *tables* to a CSV file.

        Tables are consumed in pairs (0, 1), (2, 3) and (4, 5). For each pair
        the row labels 0-9 / column labels 1-10 of both tables are merged on
        the row index (second table's columns first), a marker row
        ``'angle', y, z, 0, 0, ...`` is placed on top, and the rows are
        appended to *csv_path* without header or index.

        Parameters:
            tables: indexable collection with at least six items, each
                exposing a pandas DataFrame as ``.df``.
            pdf: path of the source PDF. Its file name must look like
                ``<a>_<y>_<b>_<z>.pdf``; the 2nd and 4th underscore-separated
                fields are parsed as floats and written into the marker row.
            csv_path: output file, opened in append mode. Defaults to
                ``'dfdemo.csv'`` in the current working directory.

        Raises:
            IndexError: if the file name has fewer than four fields or
                *tables* holds fewer than six tables.
            ValueError: if the 2nd or 4th field is not a number.
        """
        # Parse the sequence numbers from the file name only, so underscores
        # or ".pdf" in the directory part cannot shift the fields.
        stem = os.path.basename(pdf).split(".pdf")[0]
        fields = stem.split("_")
        y = float(fields[1])
        z = float(fields[3])

        # Only the first six tables are used, two at a time.
        for i in range(0, 6, 2):
            df1 = tables[i].df.loc[0:9, 1:10]
            df2 = tables[i + 1].df.loc[0:9, 1:10]

            df3 = pandas.merge(df2, df1, left_index=True, right_index=True)

            # Marker row sized to the merged frame instead of a hand-counted
            # 20-element literal; identical output for the usual 20 columns.
            marker = ['angle', y, z] + [0] * (len(df3.columns) - 3)

            # Insert at label -1, shift all labels by one and sort so the
            # marker ends up as the first row.
            df3.loc[-1] = marker
            df3.index = df3.index + 1
            df3 = df3.sort_index()

            # Append the marker row plus up to ten data rows to the CSV.
            df3.head(11).to_csv(csv_path, index=False, mode='a+', header=False)
        
        
    def job(d, l, idx, pdf):
        """Extract tables from *pdf*, then write them out when it is this job's turn.

        The slow extraction runs first, outside the lock, so jobs can overlap.
        Writing is serialized: the job polls the shared counter ``d[1]`` and
        only writes once it equals *idx*, then increments it for the next job.

        Parameters:
            d: shared mapping; ``d[1]`` holds the index of the job allowed to
                write next.
            l: lock guarding ``d[1]`` and the CSV write; must support the
                context-manager protocol.
            idx: this job's position in the write order.
            pdf: path of the PDF to process.

        Returns:
            1 on success.

        Raises:
            Whatever extraction or writing raised. The error is re-raised only
            after this job's turn has been consumed, so later jobs are never
            left waiting on a counter that would not advance.
        """
        # Slow part: extract tables without holding the lock. A failure is
        # remembered rather than raised here, because this job must still
        # take (and give up) its turn below.
        tables = None
        error = None
        try:
            tables = get_table_form_pdf(pdf)
        except Exception as exc:
            error = exc

        while True:
            # `with` guarantees the lock is released even if writing raises.
            with l:
                if d[1] == idx:
                    try:
                        if error is None:
                            print('pdf,d[1] / idx  :', d[1], idx)
                            put_table_to_cvs(tables, pdf)
                    finally:
                        # Always pass the turn on, even after a failure.
                        d[1] += 1
                    break
            time.sleep(0.01)

        if error is not None:
            raise error
        return 1
    
        
    if __name__ == '__main__':
        # Script entry point: submit one `job` per PDF to a pool of 8 worker
        # processes. A manager-backed lock and dict are shared with the
        # workers so they can take turns writing in submission order.
        with mp.Manager() as manager:
            time_start = time.time()
            # Raw string: '\p' is not a valid escape sequence, so the plain
            # literal only worked by accident and triggers a warning.
            pdfs = func_pdf_find(r'd:\python-demo')
            pool = mp.Pool(8)
            lock = manager.Lock()
            d = manager.dict()
            d[1] = 1  # index of the job that may write next
            #print(pdfs)
            pending = []
            for idx, pdf in enumerate(pdfs, start=1):
                # Asynchronous submission; keep the result handle so worker
                # errors can be reported once everything has finished.
                pending.append((pdf, pool.apply_async(job, args=(d, lock, idx, pdf))))
                # NOTE(review): submissions are staggered by 0.9 s; the reason
                # is not visible here -- confirm before removing.
                time.sleep(0.9)
            pool.close()  # no more tasks will be submitted
            pool.join()  # wait for every worker to finish
            time_end = time.time()

            # Exceptions raised inside workers only surface through get();
            # report them instead of dropping them silently.
            for pdf, result in pending:
                try:
                    result.get()
                except Exception as exc:
                    print('failed : ', pdf, exc)

            print('used times : ', time_end - time_start)
            
            
    
    
    
    cs