[Tess-developers] TheSpamSecretary TheSpamSecretary.py,1.15,1.16
Brought to you by:
kwerle
|
From: <kw...@us...> - 2003-06-18 23:34:58
|
Update of /cvsroot/tess/TheSpamSecretary
In directory sc8-pr-cvs1:/tmp/cvs-serv26617
Modified Files:
TheSpamSecretary.py
Log Message:
Better parsing of HTML mail. Added weight to URLs within HTML text. Made pure non-spam words worth less than pure-spam words.
Index: TheSpamSecretary.py
===================================================================
RCS file: /cvsroot/tess/TheSpamSecretary/TheSpamSecretary.py,v
retrieving revision 1.15
retrieving revision 1.16
diff -C2 -d -r1.15 -r1.16
*** TheSpamSecretary.py 5 May 2003 15:44:36 -0000 1.15
--- TheSpamSecretary.py 18 Jun 2003 23:34:55 -0000 1.16
***************
*** 77,81 ****
self.debugFilter = 0
self.MINIMUM_GOOD_MESSAGE_COUNT = 40
! self.MAX_WORD_LENGTH = 20
self.myWordDefinition = re.compile("[^\w\-'\$]+") ##" alphanumeric characters, dashes, apostrophes, and dollar signs"
--- 77,81 ----
self.debugFilter = 0
self.MINIMUM_GOOD_MESSAGE_COUNT = 40
! self.MAX_WORD_LENGTH = 40
self.myWordDefinition = re.compile("[^\w\-'\$]+") ##" alphanumeric characters, dashes, apostrophes, and dollar signs"
***************
*** 157,161 ****
return .4
if (bad_count == 0):
! return .01
if (good_count == 0):
return .99
--- 157,161 ----
return .4
if (bad_count == 0):
! return .0101
if (good_count == 0):
return .99
***************
*** 164,168 ****
else:
return 0.0
! return max(0.01, (min(0.99, returnValue)))
#print(self.tempDict)
--- 164,168 ----
else:
return 0.0
! return max(0.0101, (min(0.99, returnValue)))
#print(self.tempDict)
***************
*** 472,475 ****
--- 472,478 ----
self.logFile.write("Failed to decode something of type %s\n" % aMessage.getmaintype())
self.logFile.write("Failed for subject %s\n" % aMessage.getheader('Subject'))
+ if (re.search("html", aMessage.gettype())):
+ self.scanURLs(outputData.getvalue(), self.tempDict)
+ outputData = self.stripComments(outputData.getvalue())
#print("MSXXX:%s:MEXXX" % outputData.getvalue())
self.addTokensFromTextToDict(outputData.getvalue(), self.tempDict)
***************
*** 484,488 ****
while multiFile.next():
onePart = mimetools.Message(multiFile)
! #print("TYPE: %s" % onePart.gettype())
if (not (re.search("application", onePart.gettype()) or re.search("image", onePart.gettype()))):
--- 487,491 ----
while multiFile.next():
onePart = mimetools.Message(multiFile)
! #sys.stderr.write("TYPE: %s" % onePart.gettype())
if (not (re.search("application", onePart.gettype()) or re.search("image", onePart.gettype()))):
***************
*** 492,496 ****
self.logFile.write("Failed to decode something of type %s\n" % onePart.getencoding())
if (re.search("html", onePart.gettype())):
! outputData = self.stripComments(outputData.getvalue());
#else:
# print("NO DECODE")
--- 495,500 ----
self.logFile.write("Failed to decode something of type %s\n" % onePart.getencoding())
if (re.search("html", onePart.gettype())):
! self.scanURLs(outputData.getvalue(), self.tempDict)
! outputData = self.stripComments(outputData.getvalue())
#else:
# print("NO DECODE")
***************
*** 498,501 ****
--- 502,509 ----
self.addTokensFromTextToDict(outputData.getvalue(), self.tempDict)
else:
+ if ((aMessage.getheader("content-type") != None) and re.search("html", aMessage.getheader("content-type"))):
+ self.scanURLs(outputData.getvalue(), self.tempDict)
+ outputData = self.stripComments(outputData.getvalue())
+
self.addTokensFromTextToDict(outputData.getvalue(), self.tempDict)
while (1):
***************
*** 506,509 ****
--- 514,520 ----
someFile.seek(lastPosition)
break
+ if ((aMessage.getheader("content-type") != None) and re.search("html", aMessage.getheader("content-type"))):
+ self.scanURLs(oneLine, self.tempDict)
+ oneLine = self.stripComments(oneLine).getvalue()
self.addTokensFromTextToDict(oneLine, self.tempDict)
#print("oneline: %s" % oneLine)
***************
*** 512,515 ****
--- 523,541 ----
##################################################
+ def scanURLs(self, someText, someDict):
+ """
+ Scan the html text and add any found hosts to someDict.
+ """
+ outputData = StringIO.StringIO()
+ HOSTNAMEDEF = "\w\-\.\_" # 0-9 a-z A-Z - . _
+ urls = re.findall("http://.*?([" + HOSTNAMEDEF + "]+)/", someText)
+ if ((self.debugFilter) and (len(urls) > 0)):
+ sys.stdout.write("Found url hosts: %s\n" % urls)
+
+ for one_word in urls:
+ self.addOneTokenToDict("URLHOST:" + one_word, someDict)
+
+ ##################################################
+
def stripComments(self, someText):
"""
***************
*** 534,551 ****
found_words = self.myWordDefinition.split(someText)
for one_word in found_words:
! #the word has to have at least one alpha
! if ((one_word == '') or (not self.myCharDefinition.search(one_word))):
! continue
! if (len(one_word) > self.MAX_WORD_LENGTH):
! continue
! one_word = textType + one_word
! #sys.stderr.write("One word: %s\n" % one_word)
! word_count = someDict.get(one_word)
! try:
! someInt = int(word_count) + 1
! except:
! someInt = 1
! someDict[one_word] = someInt
! #print(self.someDict)
##################################################
--- 560,586 ----
found_words = self.myWordDefinition.split(someText)
for one_word in found_words:
! self.addOneTokenToDict(textType + one_word, someDict)
!
! ##################################################
!
! def addOneTokenToDict(self, someWord, someDict):
! """
! Add a single chunk of text to the dict
! """
! if (someWord == None):
! return;
!
! #the word has to have at least one alpha
! if ((someWord == '') or (not self.myCharDefinition.search(someWord))):
! return
! if (len(someWord) > self.MAX_WORD_LENGTH):
! return
! #sys.stderr.write("One word: %s\n" % someWord)
! word_count = someDict.get(someWord)
! try:
! someInt = int(word_count) + 1
! except:
! someInt = 1
! someDict[someWord] = someInt
##################################################
|