#9 Patch for hmm.py

closed-accepted
nobody
None
5
2008-06-20
2008-06-19
No

Here's a patch for tag/hmm.py, solving bugs #1934805 and #1969232. The patch is below and the final file is attached.

$ diff -w -U 2 hmm*.py
--- hmm.py 2008-06-02 14:07:08.000000000 +0200
+++ hmm-new.py 2008-06-19 13:34:08.000000000 +0200
@@ -181,7 +181,5 @@

path = self.best_path(unlabeled_sequence)
- for i in range(len(path)):
- unlabeled_sequence[i] = (unlabeled_sequence[i][_TEXT], path[i])
- return unlabeled_sequence
+ return zip(unlabeled_sequence, path)

def _output_logprob(self, state, symbol):
@@ -212,5 +210,5 @@
self._cache = (P, O, X, S)

- def best_path(self, unlabeled_sequence):
+ def best_path(self, symbols):
"""
Returns the state sequence of the optimal (most probable) path through
@@ -224,5 +222,4 @@
"""

- symbols = [token[_TEXT] for token in unlabeled_sequence]
T = len(symbols)
N = len(self._states)
@@ -270,5 +267,5 @@

# find the starting log probabilities for each state
- symbol = unlabeled_sequence[0][_TEXT]
+ symbol = unlabeled_sequence[0]
for i, state in enumerate(self._states):
V[0, i] = self._priors.logprob(state) + \
@@ -278,5 +275,5 @@
# find the maximum log probabilities for reaching each state at time t
for t in range(1, T):
- symbol = unlabeled_sequence[t][_TEXT]
+ symbol = unlabeled_sequence[t]
for j in range(N):
sj = self._states[j]
@@ -897,5 +894,5 @@
print 'Testing with state sequence', test
print 'probability =', model.probability(sequence)
- print 'tagging = ', model.tag(sequence)
+ print 'tagging = ', model.tag([word for (word,tag) in sequence])
print 'p(tagged) = ', model.probability(sequence)
print 'H = ', model.entropy(sequence)
@@ -911,11 +908,9 @@
sentences = brown.tagged_sents(categories='a')[:num_sents]

- sequences = []
- sequence = []
- symbols = set()
-
tag_re = re.compile(r'[*]|--|[^+*-]+')
tag_set = set()
+ symbols = set()

+ cleaned_sentences = []
for sentence in sentences:
for i in range(len(sentence)):
@@ -927,6 +922,7 @@
sentence[i] = (word, tag) # store cleaned-up tagged token
+ cleaned_sentences += [sentence]

- return sentences, list(tag_set), list(symbols)
+ return cleaned_sentences, list(tag_set), list(symbols)

def test_pos(model, sentences, display=False):
@@ -936,5 +932,5 @@
for sentence in sentences:
orig_tags = [token[_TAG] for token in sentence]
- sentence = [(token[_TEXT], None) for token in sentence]
+ sentence = [token[_TEXT] for token in sentence]
new_tags = model.best_path(sentence)
if display:
@@ -944,5 +940,5 @@
print new_tags
print 'Entropy:'
- print model.entropy(sentence)
+ print model.entropy([(word,None) for word in sentence])
print '-' * 60
else:

Discussion

• peter ljunglöf - 2008-06-19

New version of tag/hmm.py

• Steven Bird - 2008-06-20

Logged In: YES
user_id=195736
Originator: NO

• Steven Bird - 2008-06-20
• status: open --> closed-accepted