
"""Collect metadata, character statistics and embedded images from .docx files.

Author: jdr
Date: 2022-11-09

Usage: put this script in the same directory as the .docx files and run
``python3 demo.py``. The report is written to ``results.xlsx`` and the images
of each document are extracted to ``images/<document name>/``.
"""

import glob
import hashlib
import os
import re
import string
from pathlib import Path

import docx
import xlsxwriter

# Placeholder reported for a document property that is not available.
_MISSING = '0'
_TIME_FORMAT = '%Y-%m-%d %H:%M:%S'


def getWordData(doc):
    """Return the core properties of an opened python-docx document.

    Args:
        doc: Object exposing ``core_properties`` (a ``docx.Document``).

    Returns:
        A five-item list ``[author, created, last_modified_by, modified,
        elapsed]``. ``created`` and ``modified`` are formatted strings,
        ``elapsed`` is the ``timedelta`` between them. A timestamp that is
        missing is reported as ``'0'``, and so is ``elapsed`` when either
        timestamp is missing.
    """
    props = doc.core_properties
    created = props.created
    modified = props.modified

    # Each field is resolved independently so that one missing timestamp does
    # not discard the properties that are present.
    create_time = end_time = elapsed = _MISSING
    if created is not None:
        create_time = created.strftime(_TIME_FORMAT)
    if modified is not None:
        end_time = modified.strftime(_TIME_FORMAT)
    if created is not None and modified is not None:
        elapsed = modified - created
    else:
        # Same console notice the script has always emitted for incomplete
        # metadata.
        print("null")

    return [props.author, create_time, props.last_modified_by, end_time, elapsed]


def getWordStrs(doc):
    """Return the text of all paragraphs of ``doc`` concatenated together."""
    return "".join(paragraph.text for paragraph in doc.paragraphs)


def count_str(strs):
    """Classify every character of ``strs`` and count each category.

    Categories are tested in this order: ASCII letter, digit, whitespace,
    any other alphabetic character (reported as "Chinese"; note that this
    also matches non-ASCII letters of other scripts), everything else.

    Returns:
        A list of five strings:
        ``[english, digits, spaces, chinese, other]``.
    """
    english = digits = spaces = chinese = other = 0
    for char in strs:
        if char in string.ascii_letters:
            english += 1
        elif char.isdigit():
            digits += 1
        elif char.isspace():
            spaces += 1
        elif char.isalpha():
            chinese += 1
        else:
            other += 1
    return [str(n) for n in (english, digits, spaces, chinese, other)]


def fetch_image(doc_path, desc_path):
    """Extract the images embedded in a .docx file into ``desc_path``.

    Each image is saved as ``<document name>_<image file name>``.

    Args:
        doc_path: Path of the .docx file.
        desc_path: Output directory; created (with parents) when missing.

    Returns:
        The list of image file names written to ``desc_path``.
    """
    os.makedirs(desc_path, exist_ok=True)
    document = docx.Document(doc_path)
    # basename() copes with the platform's separators, so the prefix never
    # contains directory components.
    doc_stem = os.path.splitext(os.path.basename(doc_path))[0]

    img_list = []
    for rel in document.part.rels.values():
        # Linked (external) images have no part inside the package to read.
        if rel.is_external or "image" not in rel.target_ref:
            continue
        # Relationship targets always use "/" regardless of the platform.
        original_name = rel.target_ref.rsplit("/", 1)[-1]
        img_name = f'{doc_stem}_{original_name}'
        with open(os.path.join(desc_path, img_name), "wb") as f:
            f.write(rel.target_part.blob)
        img_list.append(img_name)
    return img_list


def get_file_md5(file_name):
    """Return the hexadecimal MD5 digest of the file ``file_name``."""
    digest = hashlib.md5()
    with open(file_name, 'rb') as fobj:
        # Read in small chunks so large images are never loaded at once.
        for chunk in iter(lambda: fobj.read(4096), b''):
            digest.update(chunk)
    return digest.hexdigest()


def ImgCount(imgDir):
    """Return the number of regular files directly inside ``imgDir``."""
    return sum(
        1 for entry in os.listdir(imgDir)
        if os.path.isfile(os.path.join(imgDir, entry))
    )


def createStudentInfoResultXlsx(workbook):
    """Add the per-document sheet with a bold header row and return it."""
    worksheet = workbook.add_worksheet('学生文档信息')
    bold = workbook.add_format({'bold': True})
    headers = (
        "文件名", "作者", "创建时间", "修改人", "最后修改时间", "图片总数",
        "字符总数", "英文字符总数", "数字字符总数", "空格字符总数",
        "中文字符总数", "特殊字符总数", "耗时",
    )
    for col, title in enumerate(headers):
        worksheet.write(0, col, title, bold)
    return worksheet


def createImgInfoResultXlsx(workbook):
    """Add the per-image sheet with a bold header row and return it."""
    worksheet = workbook.add_worksheet('学生文档图片信息')
    bold = workbook.add_format({'bold': True})
    for col, title in enumerate(("文件名", "图片md5")):
        worksheet.write(0, col, title, bold)
    return worksheet


def getFiles(dir):
    """Return the names of the entries in directory ``dir``."""
    return os.listdir(dir)


def main():
    """Process every .docx in the current directory and write results.xlsx."""
    imagesDir = "images"  # Directory the extracted images are saved under.
    os.makedirs(imagesDir, exist_ok=True)

    # "~$name.docx" are lock files Word leaves next to an open document; they
    # are not valid packages and cannot be opened.
    docx_file_list = sorted(
        name for name in glob.glob("*.docx") if not name.startswith("~$")
    )

    # The context manager closes (and therefore saves) the workbook even when
    # a document fails half-way through.
    with xlsxwriter.Workbook('results.xlsx') as workbook:
        worksheetSt = createStudentInfoResultXlsx(workbook)
        worksheetImg = createImgInfoResultXlsx(workbook)

        # Row 0 holds the headers, so data starts at row 1.
        for row, docxFile in enumerate(docx_file_list, start=1):
            document = docx.Document(docxFile)
            student_dir = os.path.join(imagesDir, Path(docxFile).stem)
            fetch_image(docxFile, student_dir)

            doc_info = getWordData(document)
            text = getWordStrs(document)  # Extracted once, reused below.
            counts = count_str(text)

            record = [
                docxFile,                     # file name
                doc_info[0],                  # author
                doc_info[1],                  # creation time
                doc_info[2],                  # last modified by
                doc_info[3],                  # last modification time
                str(ImgCount(student_dir)),   # number of images
                str(len(text)),               # total characters
                counts[0],                    # English letters
                counts[1],                    # digits
                counts[2],                    # whitespace
                counts[3],                    # Chinese characters
                counts[4],                    # other characters
                # A bare timedelta would be stored as an unformatted
                # fraction of a day; write it as readable text instead.
                str(doc_info[4]),             # elapsed time
            ]
            worksheetSt.write_row(row, 0, record)

        img_row = 1
        for student in sorted(getFiles(imagesDir)):
            student_dir = os.path.join(imagesDir, student)
            # Ignore stray files placed directly in the images directory.
            if not os.path.isdir(student_dir):
                continue
            for image in sorted(getFiles(student_dir)):
                worksheetImg.write(img_row, 0, student)
                worksheetImg.write(
                    img_row, 1, get_file_md5(os.path.join(student_dir, image))
                )
                img_row += 1


if __name__ == '__main__':
    main()