[Tess-developers] TheSpamSecretary TheSpamSecretary.py,1.15,1.16

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Update of /cvsroot/tess/TheSpamSecretary
In directory sc8-pr-cvs1:/tmp/cvs-serv26617

Modified Files:
	TheSpamSecretary.py 
Log Message:
Better parsing of HTML mail.  Added weight to URLs within HTML text.  Made pure non-spam words worth less than pure-spam words.

Index: TheSpamSecretary.py
===================================================================
RCS file: /cvsroot/tess/TheSpamSecretary/TheSpamSecretary.py,v
retrieving revision 1.15
retrieving revision 1.16
diff -C2 -d -r1.15 -r1.16
*** TheSpamSecretary.py	5 May 2003 15:44:36 -0000	1.15
--- TheSpamSecretary.py	18 Jun 2003 23:34:55 -0000	1.16
***************
*** 77,81 ****
  		self.debugFilter = 0
  		self.MINIMUM_GOOD_MESSAGE_COUNT = 40
! 		self.MAX_WORD_LENGTH = 20
  
  		self.myWordDefinition = re.compile("[^\w\-'\$]+") ##" alphanumeric characters, dashes, apostrophes, and dollar signs"
--- 77,81 ----
  		self.debugFilter = 0
  		self.MINIMUM_GOOD_MESSAGE_COUNT = 40
! 		self.MAX_WORD_LENGTH = 40
  
  		self.myWordDefinition = re.compile("[^\w\-'\$]+") ##" alphanumeric characters, dashes, apostrophes, and dollar signs"
***************
*** 157,161 ****
  			return .4
  		if (bad_count == 0):
! 			return .01
  		if (good_count == 0):
  			return .99
--- 157,161 ----
  			return .4
  		if (bad_count == 0):
! 			return .0101
  		if (good_count == 0):
  			return .99
***************
*** 164,168 ****
  		else:
  			return 0.0
! 		return max(0.01, (min(0.99, returnValue)))
  		#print(self.tempDict)
  
--- 164,168 ----
  		else:
  			return 0.0
! 		return max(0.0101, (min(0.99, returnValue)))
  		#print(self.tempDict)
  
***************
*** 472,475 ****
--- 472,478 ----
  						self.logFile.write("Failed to decode something of type %s\n" % aMessage.getmaintype())
  						self.logFile.write("Failed for subject %s\n" % aMessage.getheader('Subject'))
+ 					if (re.search("html", aMessage.gettype())):
+ 						self.scanURLs(outputData.getvalue(), self.tempDict)
+ 						outputData = self.stripComments(outputData.getvalue())
  					#print("MSXXX:%s:MEXXX" % outputData.getvalue())
  					self.addTokensFromTextToDict(outputData.getvalue(), self.tempDict)
***************
*** 484,488 ****
  					while multiFile.next():
  						onePart = mimetools.Message(multiFile)
! 						#print("TYPE: %s" % onePart.gettype())
  						if (not (re.search("application", onePart.gettype()) or re.search("image", onePart.gettype()))):
  								
--- 487,491 ----
  					while multiFile.next():
  						onePart = mimetools.Message(multiFile)
! 						#sys.stderr.write("TYPE: %s" % onePart.gettype())
  						if (not (re.search("application", onePart.gettype()) or re.search("image", onePart.gettype()))):
  								
***************
*** 492,496 ****
  								self.logFile.write("Failed to decode something of type %s\n" % onePart.getencoding())
  							if (re.search("html", onePart.gettype())):
! 								outputData = self.stripComments(outputData.getvalue());
  						#else:
  						#	print("NO DECODE")
--- 495,500 ----
  								self.logFile.write("Failed to decode something of type %s\n" % onePart.getencoding())
  							if (re.search("html", onePart.gettype())):
! 								self.scanURLs(outputData.getvalue(), self.tempDict)
! 								outputData = self.stripComments(outputData.getvalue())
  						#else:
  						#	print("NO DECODE")
***************
*** 498,501 ****
--- 502,509 ----
  					self.addTokensFromTextToDict(outputData.getvalue(), self.tempDict)
  				else:
+ 					if ((aMessage.getheader("content-type") != None) and re.search("html", aMessage.getheader("content-type"))):
+ 						self.scanURLs(outputData.getvalue(), self.tempDict)
+ 						outputData = self.stripComments(outputData.getvalue())
+ 
  					self.addTokensFromTextToDict(outputData.getvalue(), self.tempDict)
  					while (1):
***************
*** 506,509 ****
--- 514,520 ----
  							someFile.seek(lastPosition)
  							break
+ 						if ((aMessage.getheader("content-type") != None) and re.search("html", aMessage.getheader("content-type"))):
+ 							self.scanURLs(oneLine, self.tempDict)
+ 							oneLine = self.stripComments(oneLine).getvalue()
  						self.addTokensFromTextToDict(oneLine, self.tempDict)
  						#print("oneline: %s" % oneLine)
***************
*** 512,515 ****
--- 523,541 ----
  ##################################################
  
+ 	def scanURLs(self, someText, someDict):
+ 		"""
+ 		Scan the html text and add any found hosts to someDict.
+ 		"""
+ 		outputData = StringIO.StringIO()
+ 		HOSTNAMEDEF = "\w\-\.\_" # 0-9 a-z A-Z - . _
+ 		urls = re.findall("http://.*?([" + HOSTNAMEDEF + "]+)/", someText)
+ 		if ((self.debugFilter) and (len(urls) > 0)):
+ 			sys.stdout.write("Found url hosts: %s\n" % urls)
+ 
+ 		for one_word in urls:
+ 			self.addOneTokenToDict("URLHOST:" + one_word, someDict)
+ 
+ ##################################################
+ 
  	def stripComments(self, someText):
  		"""
***************
*** 534,551 ****
  		found_words = self.myWordDefinition.split(someText)
  		for one_word in found_words:
! 			#the word has to have at least one alpha
! 			if ((one_word == '') or (not self.myCharDefinition.search(one_word))):
! 				continue
! 			if (len(one_word) > self.MAX_WORD_LENGTH):
! 				continue
! 			one_word = textType + one_word
! 			#sys.stderr.write("One word: %s\n" % one_word)
! 			word_count = someDict.get(one_word)
! 			try:
! 				someInt = int(word_count) + 1
! 			except:
! 				someInt = 1
! 			someDict[one_word] = someInt
! 		#print(self.someDict)
  
  ##################################################
--- 560,586 ----
  		found_words = self.myWordDefinition.split(someText)
  		for one_word in found_words:
! 			self.addOneTokenToDict(textType + one_word, someDict)
! 
! ##################################################
! 
! 	def addOneTokenToDict(self, someWord, someDict):
! 		"""
! 		Add a single chunk of text to the dict
! 		"""
! 		if (someWord == None):
! 			return;
! 
! 		#the word has to have at least one alpha
! 		if ((someWord == '') or (not self.myCharDefinition.search(someWord))):
! 			return
! 		if (len(someWord) > self.MAX_WORD_LENGTH):
! 			return
! 		#sys.stderr.write("One word: %s\n" % someWord)
! 		word_count = someDict.get(someWord)
! 		try:
! 			someInt = int(word_count) + 1
! 		except:
! 			someInt = 1
! 		someDict[someWord] = someInt
  
  ##################################################