Python3自动化_文件批量处理(文本、PDF；读取、筛选、导出)

amazingbo

2020-05-08

利用Python3脚本语言的简练语法，高级语言的丰富类库，快速写了几个文件读取、筛选、导出的“脚本”。

这里简单总结一下关键功能。

读取ini配置文件

检查ini文件是否存在；检查输入的key在ini文件里是否有定义。

import configparser
def getConfigInfo(_ini_nm):
    """Prompt for a system code and return the matching ini section.

    The file ``<_ini_nm>.ini`` is read from the current working directory.
    The user is asked for a system code until the answer names a section
    defined in that file.

    Args:
        _ini_nm: Base name of the ini file, without the ``.ini`` extension.

    Returns:
        dict: The option names and values of the selected section.

    Raises:
        SystemExit: If the user answers with a blank line.
    """
    config = configparser.ConfigParser()
    # ConfigParser.read() returns the list of files it managed to parse;
    # an empty list therefore means the ini file could not be read.
    if not config.read(os.path.join(os.getcwd(), _ini_nm + r'.ini')):
        printLog('E', 'Read Ini file fail.')

    while True:
        sysCode = input(r'Please input the system code : (Press [Enter] to quit):').strip()
        if not sysCode:
            # Raise directly: the exit() helper is only installed by the
            # site module and is meant for interactive sessions.
            raise SystemExit

        if sysCode in config.sections():
            return dict(config[sysCode])
        print('Ini info of System [%s] is blank.\n' % sysCode)

多参数输入的获取

检查参数个数；检查参数合法性（长度，是否目录）；检查参数是否整个都是汉字。

def _main():

    path = ‘‘
    keyWord = ‘‘

    while True:
        para = input(r‘Please input the PDF directory and Key Word: (Press [Enter] to quit):‘).strip().split()

        if 2 != len(para): continue

        path = para[0]
        keyWord = para[1]

        if 0 == len(path.strip()):
            exit()

        if not os.path.exists(path):
            print(‘input path is not a exists path.‘ + ‘\n‘)
            continue

        flg = True
        for char in keyWord.strip():
            if char <= u‘\u4e00‘ or char >= u‘\u9fa5‘:
                flg = False
                break
        if not flg:
            print(‘Please input the Chinese Key Word for search.(Such as \‘物流\‘).‘ + ‘\n‘)
            continue

        break

PostgreSQL数据库处理

根据ini文件定义的数据库连接信息，尝试连库；执行SQL文。

import psycopg2
import traceback

def connDB(_cfg):
    """Open a PostgreSQL connection from the given settings.

    Args:
        _cfg: Mapping that provides the keys ``servicename``, ``dbuser``,
            ``dbpw``, ``host`` and ``port``.

    Returns:
        The connection object returned by ``psycopg2.connect``.  Any
        exception raised while connecting is reported through
        ``printLog('E', ...)`` together with its traceback.
    """
    try:
        return psycopg2.connect(database=_cfg['servicename'],
                                user=_cfg['dbuser'],
                                password=_cfg['dbpw'],
                                host=_cfg['host'],
                                port=_cfg['port'])
    except Exception:
        printLog('E', 'Exception occur at DB Connection.' + '\n' + traceback.format_exc())

def executeSql(_cfg, _sql, _params=None):
    """Run a query and return the first column of every result row.

    Args:
        _cfg: Connection settings, passed on to ``connDB``.
        _sql: SQL statement to execute.
        _params: Optional values bound to the placeholders in ``_sql``.
            Prefer this over building the statement by string
            concatenation when it contains external input.

    Returns:
        list: The first column of each fetched row.  Any exception is
        reported through ``printLog('E', ...)`` together with its
        traceback.
    """
    # Start from None so the cleanup below never touches an unbound name
    # when connecting or creating the cursor fails.
    conn = None
    cur = None
    try:
        conn = connDB(_cfg)
        cur = conn.cursor()
        cur.execute(_sql, _params)

        return [row[0] for row in cur.fetchall()]
    except Exception:
        printLog('E', 'Exception occur at Execute SQL.' + '\n' + traceback.format_exc())
    finally:
        if cur is not None:
            cur.close()
        if conn is not None:
            # The connection is always rolled back before it is closed,
            # so nothing executed here is committed.
            conn.rollback()
            conn.close()

日志处理

定义输出日志的级别；异常级别时，处理结束。

# Log to a text file whose name carries the current date.  With
# level=INFO, messages logged at DEBUG level are filtered out.
logging.basicConfig(filename='log_' + datetime.now().strftime('%Y%m%d') + '.txt',
                    level=logging.INFO,
                    format=' %(asctime)s - %(levelname)s - %(message)s')

# Single-letter level codes accepted by printLog().
logLevel = {'D': logging.DEBUG,
            'I': logging.INFO,
            'W': logging.WARNING,
            'E': logging.ERROR,
            'C': logging.CRITICAL}


def printLog(_lvl, _msg):
    """Log a message; an error-level message also ends the program.

    Args:
        _lvl: One of the level codes defined in ``logLevel``.
        _msg: Text to log.

    Raises:
        KeyError: If ``_lvl`` is not a key of ``logLevel``.
        SystemExit: After an ``'E'`` message has been logged and printed.
    """
    level = logLevel[_lvl]
    logging.log(level, _msg)
    # NOTE(review): only ERROR terminates; CRITICAL is logged and
    # execution continues - confirm that this is intended.
    if logging.ERROR == level:
        print(_msg)
        # Raise directly: the exit() helper is only installed by the
        # site module and is meant for interactive sessions.
        raise SystemExit


# Usage examples: an 'E' message is logged, printed and ends the program;
# an 'I' message is only written to the log.
printLog('E', 'srcpath is not a exists path.')
printLog('I', 'Get Src Path : %s' % srcPath)

MAP函数运用

列表元素批量处理，按第二个下划线字符截取字符串。

def getPreOfNm(x):
    """Return the part of a name that precedes its second underscore.

    Args:
        x: Name to shorten.

    Returns:
        str: ``x`` cut off just before the second ``'_'``, or ``x``
        unchanged when it contains fewer than two underscores.
    """
    # Splitting at most twice yields three parts exactly when the name
    # contains at least two underscores.
    parts = x.split('_', 2)
    if len(parts) < 3:
        return x
    return '_'.join(parts[:2])

# Distinct prefixes of the CRUD object names, in ascending order
prefixObjNm = sorted({getPreOfNm(tbNm) for tbNm in lstTb})

目录处理

目录/文件判断；目录的路径分割；完整路径的文件名取得；

# Collect the paths of all files whose name matches fileNmReg.
# srcPath may be a directory (walked recursively) or a single file.
fullFilePaths = []
if os.path.isdir(srcPath):
    for folderName, subFolders, fileNames in os.walk(srcPath):
        # Files located directly in a 'tcs' or 'doc' folder are ignored.
        # NOTE(review): sub-folders of such a folder are still walked;
        # prune subFolders in place if the whole subtree should be skipped.
        if os.path.split(folderName)[1] in ('tcs', 'doc'):
            continue
        for fn in fileNames:
            if fileNmReg.search(fn):
                fullFilePaths.append(os.path.join(folderName, fn))
elif os.path.isfile(srcPath):
    # Match against the bare file name, with symbolic links resolved.
    fn = os.path.basename(os.path.realpath(srcPath))
    if fileNmReg.search(fn):
        fullFilePaths.append(srcPath)

PDF文件读取

来源：https://www.cnblogs.com/alexzhang92/p/11488949.html

import os
from io import StringIO

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, process_pdf


def read_pdf(pdf):
    """Extract the text of a PDF and return it as a list of lines.

    Args:
        pdf: The PDF input handed to ``process_pdf`` (presumably a file
            object opened in binary mode - confirm against the caller).

    Returns:
        list: The extracted text split on ``"\\n"``.
    """
    rsrcmgr = PDFResourceManager()
    # The converter writes the extracted text into this in-memory buffer.
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
        try:
            process_pdf(rsrcmgr, device, pdf)
        finally:
            # Release the converter even when the PDF cannot be processed.
            device.close()
        content = retstr.getvalue()

    return content.split("\n")

CSV文件导出

# Create the result file inside srcPath; its name carries a timestamp.
# newline='' leaves the line endings to the csv writer.
# NOTE(review): rstFile stays open here and has to be closed by the code
# that writes the remaining rows.
rstFile = open(os.path.join(srcPath, '[CRUD]' + datetime.now().strftime('%Y%m%d%H%M%S') + '.csv'), 'w', newline='')
# Columns are separated by tabs and rows end with a single line feed.
rstWtr = csv.writer(rstFile, delimiter='\t', lineterminator='\n')
# Write the header row
rstWtr.writerow(['TYPE', 'CI', 'ENCODE', 'LINE NUM', 'CRUD', 'TABLE NM', 'FULL PATH'])

转载请注明原文链接，谢谢。

python3