# Duy Dinh - 2014-05-08
#!/usr/bin/env python
"""Recursively extract plain text from the files of a directory tree.

Walk an input directory and its subdirectories, convert every file whose name
matches a pattern to plain text with html2text, and write each result to the
corresponding location under an output directory (or print it when no output
directory is given).
"""
__version__ = "0.1"
__author__ = "Duy Dinh"
# The original string read "20014-2014", an obvious typo for the year 2014.
__copyright__ = "(C) 2014 CRP Henri Tudor."
__contributors__ = ["Duy Dinh"]
# ------------------------------------IMPORT --------------------------------------------------------------
# import built-in libraries
import itertools as it
import sys,  time,  os,  fnmatch
import optparse # parse options and arguments

# import (attached user-defined) libraries or modules
import shutils
import html2text

# import classes from libraries
from sys import stdout
# --------------------------------------------------------------------------------------------------------------

# ---------------------------- GLOBAL VARIABLES ----------------------------
# Number of matching files already present in the output directory; it is set
# by checkOptions() and read by the main loop to skip files when resuming.
PROCESSED_DOCUMENTS = 0
# ---------------------------------------------------------------------------


class extractPlainText:
    """Empty placeholder type; instances only serve as attribute holders."""


# Module-level options holder; the script section rebinds this name to the
# values parsed by optparse.
options = extractPlainText()

# find files in a directory given a file pattern
def find_files(directory, pattern):
    """Yield the path of each file under *directory* whose name matches *pattern*.

    The tree is traversed with os.walk, so subdirectories are searched
    recursively.  *pattern* is a shell-style wildcard (fnmatch syntax) that
    is compared against the base name only; each yielded path is the
    containing directory joined with that base name.
    """
    for parent, _subdirs, names in os.walk(directory):
        for matched in fnmatch.filter(names, pattern):
            yield os.path.join(parent, matched)

# process each file found
def processFile(fn,  output,  encoding):
    """Convert one HTML document to plain text and write or print the result.

    fn       -- path of a local file, or an http:// / https:// URL
    output   -- path of the file to write; when None the text is printed
    encoding -- character encoding of the input; when None it is guessed
                (feedparser for URLs, chardet for local files, with UTF-8
                as the fallback when those packages are not installed)

    The input is decoded with *encoding*, converted by html2text.html2text
    and re-encoded as UTF-8 before it is written through
    html2text.writeToFile (a function of the html2text module shipped with
    this script) or printed.

    Raises whatever opening/reading the input raises, and UnicodeDecodeError
    when the content does not match *encoding*.
    """
    baseurl = None
    if fn.startswith(('http://', 'https://')):
        # urlopen is imported here because the module-level imports do not
        # provide it.
        try:
            from urllib import urlopen          # Python 2
        except ImportError:
            from urllib.request import urlopen  # Python 3
        baseurl = fn
        response = urlopen(baseurl)
        try:
            raw = response.read()
            headers = response.headers
        finally:
            response.close()
        if encoding is None:
            try:
                from feedparser import _getCharacterEncoding as enc
            except ImportError:
                enc = lambda x, y: ('utf-8', 1)
            encoding = enc(headers, raw)[0]
            if encoding == 'us-ascii':
                encoding = 'utf-8'
    else:
        with open(fn, 'rb') as handle:
            raw = handle.read()
        if encoding is None:
            try:
                from chardet import detect
            except ImportError:
                detect = lambda x: {'encoding': 'utf-8'}
            # chardet reports None when it cannot guess; fall back to UTF-8
            encoding = detect(raw)['encoding'] or 'utf-8'

    # Decode in every case: html2text works on text, not on raw bytes.
    data = raw.decode(encoding)
    text = html2text.html2text(data, baseurl)
    text = text.encode('utf-8')

    # write to output file
    if output is not None:
        print("**** Writing plain text to file '%s'" % output)
        html2text.writeToFile(text, output)
    else:
        print(text)

# get the sub directory name comparing outputBaseDir and inputBaseDir
def getSubDirectoryName(inputBaseDir, outputBaseDir,  inputFile):
    """Return the tail of inputFile's directory that follows the common prefix.

    The common prefix is the longest leading string shared, character by
    character, by *inputBaseDir* and *outputBaseDir*.  The directory part of
    *inputFile* with that many leading characters removed is returned, or
    None when the directory is exactly the common prefix.

    NOTE(review): the prefix is computed from the two base directories, not
    from inputBaseDir alone, so for bases such as "in/" and "out/" the
    result still contains the input directory name ("in/sub") -- confirm
    this is the intended output layout.
    """
    # os.path.commonprefix compares character by character, not per path
    # component, which is the comparison this function has always used.
    commonDir = os.path.commonprefix([inputBaseDir, outputBaseDir])

    inputDir = os.path.dirname(inputFile)
    if inputDir == commonDir:
        return None
    return inputDir[len(commonDir):]

# create the hierarchical folder structure for output 
def makeOutputFile(inputFile,  subDirectory,  outputDir):
    """Return the output path for *inputFile*, creating its directory if needed.

    inputFile    -- path of the input file; only its base name is used
    subDirectory -- string appended verbatim to *outputDir* (no separator is
                    inserted), or None to place the file directly in
                    *outputDir*
    outputDir    -- base output directory

    When *subDirectory* is given and the resulting directory does not exist
    it is created, including intermediate directories.  The returned path is
    the directory, a "/" and the base name of *inputFile*.

    Raises OSError when the directory cannot be created.
    """
    if subDirectory is not None:
        outputDir = outputDir + subDirectory
        if not os.path.exists(outputDir):
            print("Making directory '%s'" % outputDir)
            try:
                os.makedirs(outputDir)
            except OSError:
                # Something else may have created the directory between the
                # check and the call; only a real failure is propagated.
                if not os.path.isdir(outputDir):
                    raise

    return outputDir + "/" + os.path.basename(inputFile)

# check input options
def checkOptions(inputDirectory,  outputDirectory,  encoding,  file_pattern):
    """Validate the directories and prepare the output directory interactively.

    inputDirectory  -- directory to read from; must exist when not None
    outputDirectory -- directory to write to, or None for no file output
    encoding        -- accepted for interface compatibility; not examined
    file_pattern    -- wildcard used to count files already present in the
                       output directory

    Questions are printed and answers read from standard input:
    * a missing output directory is created only after confirmation;
    * when the existing output directory already holds matching files, the
      user chooses between resuming and (after a second confirmation)
      deleting and recreating the directory.

    The global PROCESSED_DOCUMENTS is set to the number of matching files
    found in the output directory, or 0 when the directory was absent or has
    been emptied.

    Returns 1 when the options are usable and 0 otherwise.
    """
    global PROCESSED_DOCUMENTS
    PROCESSED_DOCUMENTS = 0

    inputOk = inputDirectory is None or os.path.exists(inputDirectory)
    if not inputOk:
        print("*** Input directory '%s' does not exist!" % inputDirectory)

    outputOk = True
    if outputDirectory is not None:
        if not os.path.exists(outputDirectory):
            print("*** Output directory '%s' does not exists!" % outputDirectory)
            print("Would you like to create output directory ? (y|n)")
            outputOk = _confirmFromStdin()
            if outputOk:
                os.makedirs(outputDirectory)
        else:
            # compute the number of documents that have been already processed
            PROCESSED_DOCUMENTS = sum(
                1 for _ in find_files(outputDirectory, file_pattern))
            if PROCESSED_DOCUMENTS > 0:
                print("There are %d documents already processed!" % PROCESSED_DOCUMENTS)
                print("Would you like to resume the process? (y|n)")
                if not _confirmFromStdin():
                    print("Remove output directory '%s' ? All content from this folder will be deleted. Please confirm deletion (y|n)" % outputDirectory)
                    if _confirmFromStdin():
                        # standard-library shutil; imported here because the
                        # module-level imports do not provide it
                        import shutil
                        shutil.rmtree(outputDirectory)
                        os.makedirs(outputDirectory)
                        # reset the number of processed documents
                        PROCESSED_DOCUMENTS = 0

    # Creating the output directory must not hide a missing input directory.
    if inputOk and outputOk:
        return 1
    return 0


def _confirmFromStdin():
    """Read one line from standard input; return True when it is 'y' or 'Y'."""
    return sys.stdin.readline().strip() in ("y", "Y")

def printCompletionTime(startTime,  endTime):
    """Write the elapsed time between two timestamps to standard output.

    startTime, endTime -- timestamps in seconds (the script passes
                          time.time() values)

    The elapsed time is truncated to whole seconds and written as one line,
    for example "Completion time: 1 hour 2 minutes 3 seconds".  Hours and
    minutes appear only when they are at least 1; seconds are always shown.
    """
    elapsed = int(endTime - startTime)
    hours, rest = divmod(elapsed, 3600)
    minutes, seconds = divmod(rest, 60)

    def describe(amount, unit):
        # plural form for every amount except exactly one
        return "%d %s%s" % (amount, unit, "" if amount == 1 else "s")

    parts = []
    if hours >= 1:
        parts.append(describe(hours, "hour"))
    if minutes >= 1:
        parts.append(describe(minutes, "minute"))
    parts.append(describe(seconds, "second"))

    sys.stdout.write("Completion time: " + " ".join(parts) + "\n")
# main
if __name__ == "__main__":
    # Command-line entry point: parse the options, validate the directories,
    # then convert every matching file found below the input directory.

    # build options
    p = optparse.OptionParser('%prog [-i|--input INPUTDIRECTORY] [-o|--output OUTPUTDIRECTORY] [-e|--encoding ENCODING] [-p|--pattern FILEPATTERN] [INPUTDIRECTORY]', version='%prog ' + __version__)
    p.add_option("-i",  "--input",  help="Input directory")
    p.add_option("-o", "--output", help="output directory")
    p.add_option("-e",  "--encoding", default="utf8",  help="file encoding, e.g., 'ascii', 'utf8', etc.")
    p.add_option("-p",  "--pattern",  default="*.*",  help="file pattern, e.g., '*.txt', '*.xml', etc.")
    # parse options and arguments
    (options, args) = p.parse_args()

    # The input directory is the first positional argument; -i/--input is
    # the fallback when no positional argument is given.
    if args:
        inputDirectory = args[0]
    else:
        inputDirectory = options.input
    if inputDirectory is None:
        p.print_help()
        p.error('*** Input directory must be specified!!')  # exits

    encoding = options.encoding
    file_pattern = options.pattern
    outputDirectory = options.output
    if not checkOptions(inputDirectory,  outputDirectory,  encoding,  file_pattern):
        p.error("Please verify the options and arguments!!!")  # exits

    if not inputDirectory.endswith("/"):
        inputDirectory = inputDirectory + "/"
    if outputDirectory is not None and not outputDirectory.endswith("/"):
        outputDirectory = outputDirectory + "/"

    # First pass: count the matching files so progress can be reported.
    totalDocs = sum(1 for _ in find_files(inputDirectory, file_pattern))

    counter = 0  # 1-based number of the file currently being handled
    startTime = time.time()
    for input_file in find_files(inputDirectory, file_pattern):
        counter = counter + 1
        # Resume support: skip exactly as many files as checkOptions counted
        # in the output directory.
        if counter <= PROCESSED_DOCUMENTS:
            continue

        # process each file
        print("[%d] Extracting plain text from file '%s'" % (counter,  input_file))
        # create output file
        output_file = None
        if outputDirectory is not None:
            subDirectoryName = getSubDirectoryName(inputDirectory,  outputDirectory,  input_file)
            output_file = makeOutputFile(input_file,  subDirectoryName,  outputDirectory)

        processFile(input_file,  output_file,  encoding)

        if counter % 1000 == 0:
            print("**** Task progress : [%.2f %%]" % ((100.0 * counter) / totalDocs))
            # pause kept from the original; presumably leaves the progress
            # line readable -- TODO confirm it is still wanted
            time.sleep(2)
    # end for
    print("%d/%s files processed." % (counter,  totalDocs))
    endTime = time.time()
    printCompletionTime(startTime,  endTime)
 

# Last edit: Duy Dinh 2014-05-08