#!/usr/bin/env python
"""Extract plain text from every matching file under a directory tree.

The input directory is walked recursively.  Each file whose base name matches
the file pattern is converted with ``html2text`` and the result is handed to
``html2text.writeToFile`` for a file below the output directory, or printed to
standard output when no output directory is given.
"""

__version__ = "0.1"
__author__ = "Duy Dinh"
__copyright__ = "(C) 20014-2014 CRP Henri Tudor."
__contributors__ = ["Duy Dinh"]

# ------------------------------------ IMPORT ------------------------------------
# built-in libraries
import fnmatch
import itertools as it
import optparse  # parse options and arguments
import os
import sys
import time
from contextlib import closing
from sys import stdout

try:  # Python 3
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import urlopen

# (attached user-defined) libraries or modules
# NOTE(review): `shutils` is imported as an attached module and is expected to
# provide `rmtree`; confirm it is not a misspelling of the stdlib `shutil`.
import shutils
import html2text

# ------------------------------- GLOBAL VARIABLES -------------------------------
# Number of documents already present in the output directory; set by
# checkOptions() and read by main() to skip work when resuming.
PROCESSED_DOCUMENTS = 0


class extractPlainText:
    """Empty placeholder type used to initialise the module-level options."""
    pass


# initialize options (replaced by the parsed command-line options in main())
options = extractPlainText()


def find_files(directory, pattern):
    """Yield the path of every file below *directory* matching *pattern*.

    directory -- root of the tree walked with os.walk
    pattern   -- fnmatch-style pattern applied to each base name
    """
    for root, _dirs, files in os.walk(directory):
        for basename in files:
            if fnmatch.fnmatch(basename, pattern):
                yield os.path.join(root, basename)


def _default_url_encoding(headers, raw):
    """Fallback used when feedparser is unavailable: assume UTF-8."""
    return ('utf-8', 1)


def _default_detect(raw):
    """Fallback used when chardet is unavailable: assume UTF-8."""
    return {'encoding': 'utf-8'}


def processFile(fn, output, encoding):
    """Convert one document to plain text.

    fn       -- path of the input file, or an http(s) URL
    output   -- output file name, or None to print the text instead
    encoding -- encoding of the input, or None to detect it (feedparser for
                URLs, chardet for files; UTF-8 when neither is installed)
    """
    baseurl = None
    if fn.startswith(('http://', 'https://')):
        baseurl = fn
        # closing() releases the connection even if read() raises.
        with closing(urlopen(baseurl)) as response:
            raw = response.read()
            headers = response.headers
        if encoding is None:
            try:
                from feedparser import _getCharacterEncoding as enc
            except ImportError:
                enc = _default_url_encoding
            encoding = enc(headers, raw)[0]
            if encoding == 'us-ascii':
                encoding = 'utf-8'
    else:
        with open(fn, 'rb') as handle:
            raw = handle.read()
        if encoding is None:
            try:
                from chardet import detect
            except ImportError:
                detect = _default_detect
            # chardet reports None when it cannot decide; fall back to UTF-8.
            encoding = detect(raw)['encoding'] or 'utf-8'

    data = raw.decode(encoding)
    text = html2text.html2text(data, baseurl)
    text = text.encode('utf-8')

    # write to output file
    if output is not None:
        print("**** Writing plain text to file '%s'" % output)
        html2text.writeToFile(text, output)
    else:
        print(text)


def getSubDirectoryName(inputBaseDir, outputBaseDir, inputFile):
    """Return the sub directory to create below the output directory.

    The result is the directory of *inputFile* with the leading characters
    shared by *inputBaseDir* and *outputBaseDir* removed, or None when the
    directory of *inputFile* equals that shared prefix.
    """
    # character-wise common prefix of the two base directories
    commonDir = ''.join(
        pair[0] for pair in it.takewhile(lambda t: t[0] == t[1],
                                         zip(inputBaseDir, outputBaseDir)))

    inputDir = os.path.dirname(inputFile)
    if commonDir == inputDir:
        return None
    return inputDir[len(commonDir):]


def makeOutputFile(inputFile, subDirectory, outputDir):
    """Return the output file name for *inputFile*, creating its directory.

    inputFile    -- path of the input file; only its base name is reused
    subDirectory -- string appended to *outputDir*, or None for no sub directory
    outputDir    -- base output directory
    """
    filename = os.path.basename(inputFile)
    if subDirectory is not None:
        # Plain concatenation on purpose: the caller passes outputDir with a
        # trailing separator and subDirectory without a leading one.
        outputDir = outputDir + subDirectory
        if not os.path.exists(outputDir):
            print("Making directory '%s'" % outputDir)
            os.makedirs(outputDir)
    return os.path.join(outputDir, filename)


def _confirm(question):
    """Print *question* and return True when the answer read is 'y' or 'Y'."""
    print(question)
    return sys.stdin.readline().strip() in ("y", "Y")


def checkOptions(inputDirectory, outputDirectory, encoding, file_pattern):
    """Validate the directories and prepare the resume counter.

    Sets the global PROCESSED_DOCUMENTS to the number of files matching
    *file_pattern* already present in *outputDirectory* (0 when the directory
    is absent, not given, or has just been emptied on request).  May prompt on
    standard input to create or to delete the output directory.

    Returns 1 when the options are usable, 0 otherwise.
    """
    global PROCESSED_DOCUMENTS
    PROCESSED_DOCUMENTS = 0
    valid = 1

    if inputDirectory is not None and not os.path.exists(inputDirectory):
        valid = 0
        print("*** Input directory '%s' does not exist!" % inputDirectory)

    if outputDirectory is None:
        return valid

    if not os.path.exists(outputDirectory):
        print("*** Output directory '%s' does not exists!" % outputDirectory)
        if _confirm("Would you like to create output directory ? (y|n)"):
            os.makedirs(outputDirectory)
        else:
            # Only a refused creation invalidates the run; a successful one
            # must not hide an invalid input directory detected above.
            valid = 0
        return valid

    # compute the number of documents that have been already processed
    PROCESSED_DOCUMENTS = sum(
        1 for _ in find_files(outputDirectory, file_pattern))
    if PROCESSED_DOCUMENTS > 0:
        print("There are %d documents already processed!" % PROCESSED_DOCUMENTS)
        if not _confirm("Would you like to resume the process? (y|n)"):
            if _confirm("Remove output directory '%s' ? All content from this "
                        "folder will be deleted. Please confirm deletion (y|n)"
                        % outputDirectory):
                shutils.rmtree(outputDirectory)
                os.makedirs(outputDirectory)
                # reset the number of processed documents
                PROCESSED_DOCUMENTS = 0
    return valid


def printCompletionTime(startTime, endTime):
    """Write the elapsed time between two time.time() stamps to stdout.

    Only non-zero units are shown, e.g. "Completion time: 1 hour 2 minutes";
    a run shorter than one second is reported as "0 seconds".
    """
    hours, rest = divmod(endTime - startTime, 3600)
    minutes, seconds = divmod(rest, 60)

    parts = []
    for amount, label in ((hours, "hour"), (minutes, "minute"),
                          (seconds, "second")):
        amount = int(amount)
        if amount >= 1:
            plural = "" if amount == 1 else "s"
            parts.append("%d %s%s" % (amount, label, plural))
    if not parts:
        parts.append("0 seconds")
    stdout.write("Completion time: %s\n" % " ".join(parts))


def main():
    """Parse the command line and convert every matching input file."""
    global options

    # build options
    p = optparse.OptionParser(
        '%prog [-i|--input] [INPUTDIRECTORY] [-o|--output OUTPUTDIRECTORY] '
        '[-e|--encoding ENCODING] [-p|pattern FILEPATTERN]',
        version='%prog ' + __version__)
    p.add_option("-i", "--input", help="Input directory")
    p.add_option("-o", "--output", help="output directory")
    p.add_option("-e", "--encoding", default="utf8",
                 help="file encoding, e.g., 'ascii', 'utf8', etc.")
    p.add_option("-p", "--pattern", default="*.*",
                 help="file pattern, e.g., '*.txt', '*.xml', etc.")

    # parse options and arguments
    (options, args) = p.parse_args()

    # The input directory may be given positionally or with -i/--input.
    inputDirectory = args[0] if args else options.input
    if inputDirectory is None:
        p.print_help()
        p.error('*** Input directory must be specified!!')

    encoding = options.encoding
    file_pattern = options.pattern
    outputDirectory = options.output

    if not checkOptions(inputDirectory, outputDirectory, encoding,
                        file_pattern):
        p.error("Please verify the options and arguments!!!")

    if not inputDirectory.endswith("/"):
        inputDirectory = inputDirectory + "/"
    if outputDirectory is not None and not outputDirectory.endswith("/"):
        outputDirectory = outputDirectory + "/"

    # Recursively count all matching files in the input directory.
    totalDocs = sum(1 for _ in find_files(inputDirectory, file_pattern))

    counter = 0
    startTime = time.time()
    for counter, input_file in enumerate(
            find_files(inputDirectory, file_pattern), 1):
        # Resume: skip exactly as many files as the output already holds.
        # NOTE(review): this assumes os.walk lists files in the same order on
        # every run -- confirm for the target file system.
        if counter <= PROCESSED_DOCUMENTS:
            continue

        # process each file
        print("[%d] Extracting plain text from file '%s'"
              % (counter, input_file))

        # create output file
        output_file = None
        if outputDirectory is not None:
            subDirectoryName = getSubDirectoryName(
                inputDirectory, outputDirectory, input_file)
            output_file = makeOutputFile(
                input_file, subDirectoryName, outputDirectory)

        processFile(input_file, output_file, encoding)

        if counter % 1000 == 0:
            print("**** Task progress : [%.2f%%]"
                  % ((100.0 * counter) / totalDocs))
            time.sleep(2)

    print("%d/%s files processed." % (counter, totalDocs))
    endTime = time.time()
    printCompletionTime(startTime, endTime)


if __name__ == "__main__":
    main()
# Last edit: Duy Dinh 2014-05-08