当前位置:博文首页 > 广大菜鸟的博客:基于python读写word的合并多个不同格式文档的需求
背景是这样的:
老师让我负责收同学们的实验报告。有些同学把代码存成了不同格式的文件(.cpp、.py、.java、.txt),写报告用的文档格式也不统一(.wps、.doc、.docx 等)。这下就麻烦了:一个学期下来有几百份实验报告要处理,逐个复制粘贴太累,不如写段代码放松一下,毕竟刚考完试。
import os
import chardet
import send2trash
import shutil
from win32com.client import Dispatch
rootPath = r'C:\Users\Lenovo\Desktop\test'  # root directory of one class; main() processes each sub-folder of it
# Find the lab-report document among the files of one folder.
def getDocName(rootPath, filenamelist):
    """Return the first Word-style report file name found in *filenamelist*.

    :param rootPath: folder the names belong to (kept for interface
        compatibility; the extension is read from the name alone)
    :param filenamelist: file names (not full paths) to search, in order
    :return: the first name ending in .docx/.doc/.wps, or '' when there is none
    """
    for item in filenamelist:
        # Word drops "~$xxx.docx" owner/lock files next to documents it has
        # open; they share the extension but are not real reports.
        if item.startswith('~$'):
            continue
        # Compare case-insensitively so that e.g. "REPORT.DOCX" is recognised.
        if os.path.splitext(item)[-1].lower() in ('.docx', '.doc', '.wps'):
            return item
    return ''
# Append the text of the other files in a folder to a copy of its Word report.
def otherFilesAppend2Doc(rootPath, docFileName, otherfileNameList):
    """Merge the other files of *rootPath* into a copy of the report document.

    The report is copied one level up (next to *rootPath*), the text of each
    file in *otherfileNameList* is appended to that copy through Word
    automation, and finally *rootPath* is sent to the recycle bin.

    :param rootPath: folder that holds the report and the other files
    :param docFileName: name of the report document inside *rootPath*
    :param otherfileNameList: names (inside *rootPath*) of the files to append
    :return: None
    """
    topLevelPath = os.path.dirname(rootPath)
    newFileName = os.path.join(topLevelPath, docFileName)
    # shutil.copy overwrites an existing copy from an earlier run.
    shutil.copy(os.path.join(rootPath, docFileName), newFileName)

    # Read and decode every file before Word is started, so a decoding
    # failure cannot leave a hidden Word process behind.
    contents = []
    for item in otherfileNameList:
        singleFileName = os.path.join(rootPath, item)
        if not os.path.isfile(singleFileName):
            continue  # sub-folders cannot be opened as text
        # Open in binary mode first to detect the encoding ...
        with open(singleFileName, 'rb') as frb:
            cur_encoding = chardet.detect(frb.read())['encoding']
        # ... then re-open as text with the detected encoding.
        with open(singleFileName, 'r', encoding=cur_encoding) as f2:
            contents.append(f2.read())

    if contents:
        word = Dispatch('Word.Application')  # start the Word application
        try:
            word.Visible = 0  # run in the background
            word.DisplayAlerts = 0  # suppress dialogs
            # Open the copy once and append everything, instead of
            # re-opening and re-saving the document for every file.
            doc = word.Documents.Open(FileName=newFileName, Encoding='gbk')
            try:
                for content in contents:
                    myRange = doc.Range()
                    myRange.InsertAfter('\n')
                    myRange.InsertAfter(content)
                doc.SaveAs(newFileName)
            finally:
                # 0 == wdDoNotSaveChanges: everything wanted was saved by
                # SaveAs above; on failure the partial edits are discarded.
                doc.Close(0)
        finally:
            # Always shut Word down, even when an append failed.
            word.Quit()

    # Move the processed folder to the recycle bin (not deleted outright) so
    # the result can still be checked.
    send2trash.send2trash(rootPath)
def main():
    """Merge the files of every sub-folder of the module-level ``rootPath``.

    For each sub-folder the report document is located with ``getDocName``
    and the remaining entries are appended to it with
    ``otherFilesAppend2Doc``. Plain files directly under ``rootPath``, empty
    sub-folders and sub-folders without a report are skipped (the latter two
    with a printed message).

    :return: None
    """
    if not os.path.isdir(rootPath):
        print('请输入文件夹目录')
        return
    for item in os.listdir(rootPath):
        concretePath = os.path.join(rootPath, item)
        if os.path.isfile(concretePath):  # only sub-folders are merged
            continue
        tmpList = os.listdir(concretePath)
        if not tmpList:
            print(concretePath, '无文件')
            continue
        docName = getDocName(concretePath, tmpList)
        if docName == '':
            print(concretePath + "缺失实验报告")
            continue
        # Everything except the report itself gets appended to the report.
        tmpList.remove(docName)
        otherFilesAppend2Doc(concretePath, docName, tmpList)


# Run only when executed as a script, not when the module is imported.
if __name__ == '__main__':
    main()
原始状态:(测试阶段每个文件以文件名字测试)
实现合并后:
# First open the file in binary mode
with open(singleFileName, 'rb') as frb:
# Detect the encoding from the raw bytes
cur_encoding = chardet.detect(frb.read())['encoding']
# Re-open the file as text using the detected encoding
with open(singleFileName, 'r', encoding=cur_encoding) as f2:
content = f2.read()
# Send rootPath to the recycle bin
send2trash.send2trash(rootPath)
from docx import Document
from win32com.client import Dispatch
import chardet
import os
# 1. Reads .docx files only
def readFromDocx(src):
    """Print the text of every paragraph of the .docx document at *src*.

    :param src: path of the .docx file to read
    :return: None
    """
    paragraphs = Document(src).paragraphs
    for text in (entry.text for entry in paragraphs):
        print(text)
# 1.1 Reads several Word formats through Word automation
def readFromWord(src):
    """Print the text of every paragraph of the document at *src* using Word.

    :param src: path of the document to open in Word
    :return: None
    """
    # Word runs as a separate process, so hand it an absolute path rather
    # than one that is relative to Python's working directory.
    src = os.path.abspath(src)
    word = Dispatch('Word.Application')  # start the Word application
    try:
        word.Visible = 0  # run in the background
        word.DisplayAlerts = 0  # suppress dialogs
        doc = word.Documents.Open(FileName=src, Encoding='gbk')
        try:
            for para in doc.paragraphs:
                print(para.Range.Text)
        finally:
            # 0 == wdDoNotSaveChanges: the document is only read here.
            doc.Close(0)
    finally:
        # Always shut Word down so no hidden process is left behind.
        word.Quit()
# 2. Overwrite-only for .docx files; other formats (.wps, .doc, ...) can be
#    written without overwriting
def write2Word(src, dest, mode=True):
    """Write the text of *src* as one paragraph into the document *dest*.

    :param src: path of the non-Word source file
    :param dest: path of the Word document to write
    :param mode: True to start from a new empty document; False to add to
        *dest* when it already exists
    :return: None
    """
    # Read the raw bytes once so the encoding can be detected ...
    with open(src, 'rb') as binary_file:
        raw_bytes = binary_file.read()
    detected = chardet.detect(raw_bytes)['encoding']
    # ... then read the file as text with that encoding.
    with open(src, 'r', encoding=detected) as text_file:
        text = text_file.read()

    if mode or not os.path.exists(dest):
        target = Document()  # new, empty document
    else:
        target = Document(dest)  # continue the existing document
    target.add_paragraph(text)
    target.save(dest)
# 2.1 Writes several Word formats through Word automation
def write2Word2(src, dest, mode=True):
    """Write the text of *src* into the document *dest* using Word.

    :param src: path of the non-Word source file
    :param dest: path of the Word document to write
    :param mode: True to write *src* into a new document saved as *dest*;
        False to append to the current content of *dest*
    :return: None
    """
    # Open in binary mode first to detect the encoding ...
    with open(src, 'rb') as frb:
        cur_encoding = chardet.detect(frb.read())['encoding']
    # ... then re-open as text with the detected encoding.
    with open(src, 'r', encoding=cur_encoding) as f2:
        content = f2.read()

    # Word runs as a separate process, so hand it an absolute path rather
    # than one that is relative to Python's working directory.
    dest = os.path.abspath(dest)
    if not mode and not os.path.exists(dest):
        # Appending needs a file for Word to open; create an empty one.
        with open(dest, 'w'):
            pass

    word = Dispatch('Word.Application')  # start the Word application
    try:
        word.Visible = 0  # run in the background
        word.DisplayAlerts = 0  # suppress dialogs
        if mode:
            doc = word.Documents.Add()
        else:
            doc = word.Documents.Open(FileName=dest, Encoding='gbk')
        try:
            if mode:
                myRange = doc.Range(0, 0)
                myRange.InsertBefore(content)
            else:
                myRange = doc.Range()
                myRange.InsertAfter(content)
            myRange.InsertAfter('\r\n')
            doc.SaveAs(dest)
        finally:
            # 0 == wdDoNotSaveChanges: SaveAs above already stored the
            # result; on failure the partial edits are discarded.
            doc.Close(0)
    finally:
        # Always shut Word down so no hidden process is left behind.
        word.Quit()
cs