./cmuclmtk_trunk/src/lm_combine -lm1 lm_giga_5k_nvp_3gram.arpa -lm2 lm_giga_5k_nvp_3gram.arpa -weight w.wt -lm a.arpa
Reading in a 3-gram language model.
Number of 1-grams = 5000.
Number of 2-grams = 2821547.
Number of 3-grams = 8095821.
Reading unigrams...
Reading 2-grams...
..................................................
..................................................
.........................................
Reading 3-grams...
..................................................
..................................................
..................................................
..................................................
..................................................
..................................................
..................................................
..................................................
....Reading in a 3-gram language model.
Number of 1-grams = 5000.
Number of 2-grams = 2821547.
Number of 3-grams = 8095821.
Reading unigrams...
Reading 2-grams...
..................................................
..................................................
.........................................
Reading 3-grams...
..................................................
..................................................
..................................................
..................................................
..................................................
..................................................
..................................................
..................................................
....
combine lms
Reading in a 3-gram language model.
Number of 1-grams = 5001.
Number of 2-grams = 2821547.
Number of 3-grams = 8095821.
Reading unigrams...
Reading 2-grams...
..................................................
..................................................
.........................................Error - Repeated 2-gram in ARPA format language model.
If you would like to refer to this comment somewhere else in this project, copy and paste the following link:
The problem is actually out-of-order n-grams. lm_combine reports the wrong error because of what I think is a bug in its error checking. Here is a patch that fixes the error-checking code and also gives more informative error messages in this situation. I'm not a C programmer, so you might want to check whether my use of strncat is secure. I believe you can apply this patch with the command "cd cmuclmtk ; patch -p1 < lm_combine_diff".
@@ -488,17 +526,19 @@
k = arpa_lm->n;
}
else {
- if ((current_ngram[k] > previous_ngram[k]) && (j > 0)) {
- quit(-1,"Error : n-grams are not correctly ordered.\n");
+ if ((current_ngram[k] < previous_ngram[k]) && (j > 0)) {
+ generate_combined_lm_error_string(debugStr, 1000, "Error : n-grams are not correctly ordered.\n", i, j, previous_ngram, current_ngram, arpa_lm);
+ quit(-1,debugStr,i);
}
}
}
}
- if (pos_of_novelty == i && j != 1)
- quit(-1,"Error - Repeated %d-gram in ARPA format language model.\n",
- i);
+ if (pos_of_novelty == (i) && j != 1) {
+ generate_combined_lm_error_string(debugStr, 1000, "\nError - Repeated %d-gram in ARPA format language model.\n", i, j, previous_ngram, current_ngram, arpa_lm);
+ quit(-1,debugStr,i);
+ }
+
if (pos_of_novelty != i-1) {
if (i==2) {
/* Deal with unigram pointers */
@@ -597,16 +637,17 @@
pos_of_novelty = k;
k = arpa_lm->n;
}else {
if ((current_ngram[k] > previous_ngram[k]) && (j>0)) {
quit(-1,"Error : n-grams are not correctly ordered.\n");
if ((current_ngram[k] < previous_ngram[k]) && (j>0)) {
generate_combined_lm_error_string(debugStr, 1000, "Error : n-grams are not correctly ordered.\n", i, j, previous_ngram, current_ngram, arpa_lm);
quit(-1,"Error : Same %d-gram occurs twice in ARPA format LM.\n",
arpa_lm->n);
generate_combined_lm_error_string(debugStr, 1000, "\nError - Repeated %d-gram in ARPA format language model.\n", i, j, previous_ngram, current_ngram, arpa_lm);
quit(-1,debugStr,i);
}
if (pos_of_novelty != arpa_lm->n-1) {
If you would like to refer to this comment somewhere else in this project, copy and paste the following link:
lm_giga from http://www.inference.phy.cam.ac.uk/kv227/lm_giga/ . Here's what happens:
./cmuclmtk_trunk/src/lm_combine -lm1 lm_giga_5k_nvp_3gram.arpa -lm2 lm_giga_5k_nvp_3gram.arpa -weight w.wt -lm a.arpa
Reading in a 3-gram language model.
Number of 1-grams = 5000.
Number of 2-grams = 2821547.
Number of 3-grams = 8095821.
Reading unigrams...
Reading 2-grams...
..................................................
..................................................
.........................................
Reading 3-grams...
..................................................
..................................................
..................................................
..................................................
..................................................
..................................................
..................................................
..................................................
....Reading in a 3-gram language model.
Number of 1-grams = 5000.
Number of 2-grams = 2821547.
Number of 3-grams = 8095821.
Reading unigrams...
Reading 2-grams...
..................................................
..................................................
.........................................
Reading 3-grams...
..................................................
..................................................
..................................................
..................................................
..................................................
..................................................
..................................................
..................................................
....
combine lms
Reading in a 3-gram language model.
Number of 1-grams = 5001.
Number of 2-grams = 2821547.
Number of 3-grams = 8095821.
Reading unigrams...
Reading 2-grams...
..................................................
..................................................
.........................................Error - Repeated 2-gram in ARPA format language model.
I think this workaround creates a modified version of the Gigaword language model that lm_combine can handle:
cp -f lm_giga_5k_nvp_3gram/lm_giga_5k_nvp_3gram.arpa mylm5v
perl -e 'undef $/; $_=<>; s/<unk>/<UNK>/; print' -i mylm5v
perl -e 'undef $/; $_=<>; s/^.*<unk>.*\n//gm; print' -i mylm5v
onegrams=`perl -e 'undef $/; $_=<>; /\\\\1-grams:\n(.*?)\n\\\\2-grams:/s; $_=$1; $c=0; while (/\n/g) {$c++} print $c' mylm5v`
twograms=`perl -e 'undef $/; $_=<>; /\\\\2-grams:\n(.*?)\n\\\\3-grams:/s; $_=$1; $c=0; while (/\n/g) {$c++} print $c' mylm5v`
threegrams=`perl -e 'undef $/; $_=<>; /\\\\3-grams:\n(.*?)\n\\\\end\\\\/s; $_=$1; $c=0; while (/\n/g) {$c++} print $c' mylm5v`
perl -e 's/ngram 1=.*/ngram 1='$onegrams'/;' -pi mylm5v
perl -e 's/ngram 2=.*/ngram 2='$twograms'/;' -pi mylm5v
perl -e 's/ngram 3=.*/ngram 3='$threegrams'/;' -pi mylm5v
perl -e 'undef $/; $_=<>; s/(.*UNK.*\n)//; $unknownLine = $1; s/\\1-grams:\s*\n/\\1-grams:\n$unknownLine/; print' -i mylm5v
Now you can use mylm5v in place of lm_giga_5k_nvp_3gram.arpa as an argument to lm_combine.
The problem is actually out-of-order n-grams. lm_combine reports the wrong error because of what I think is a bug in its error checking. Here is a patch that fixes the error-checking code and also gives more informative error messages in this situation. I'm not a C programmer, so you might want to check whether my use of strncat is secure. I believe you can apply this patch with the command "cd cmuclmtk ; patch -p1 < lm_combine_diff".
--- cmuclmtk_trunk/src/programs/lm_combine.c 2008-04-15 21:52:26.000000000 -0700
+++ cmuclmtk/src/programs/lm_combine.c 2008-04-29 00:43:21.000000000 -0700
@@ -378,6 +378,43 @@
DeleteArray(words);
}
+char *generate_combined_lm_error_string(char *debugStr, int debugStrSz, char *initial_string, int i, int j, id__t *previous_ngram, id__t *current_ngram,arpa_lm_t *arpa_lm)
+{
+char debug1[256], debug2[256], debug3[256], debug4[256];
+ int k2;
+
+ snprintf(debugStr, debugStrSz, initial_string);
+ snprintf(debug1, 256, "At position %d of the %d-grams\n", j, i);
+ strncat(debugStr, debug1, debugStrSz-strlen(debugStr));
+ strncat(debugStr, "previous ngram: ", debugStrSz-strlen(debugStr));
+ for (k2=0;k2<i;k2++) {
+ /*strcpy(debug1,arpa_lm->vocab[arpa_lm->word_id[k2][j]]);*/
+ strncat(debugStr, arpa_lm->vocab[previous_ngram[k2]], debugStrSz-strlen(debugStr));
+ strncat(debugStr, " ", debugStrSz-strlen(debugStr));
+ }
+ strncat(debugStr, "\n", debugStrSz-strlen(debugStr));
+ strncat(debugStr, "previous ngram ids: ", debugStrSz-strlen(debugStr));
+ for (k2=0;k2<i;k2++) {
+ snprintf(debug1, 256, "%d ", previous_ngram[k2], debugStrSz-strlen(debugStr));
+ strncat(debugStr, debug1, debugStrSz-strlen(debugStr));
+ }
+ strncat(debugStr, "\n", debugStrSz-strlen(debugStr));
+
+ strncat(debugStr, "next ngram: ", debugStrSz-strlen(debugStr));
+ for (k2=0;k2<i;k2++) {
+ strncat(debugStr,arpa_lm->vocab[current_ngram[k2]], debugStrSz-strlen(debugStr));
+ strncat(debugStr, " ", debugStrSz-strlen(debugStr));
+ }
+ strncat(debugStr, "\n", debugStrSz-strlen(debugStr));
+ strncat(debugStr, "next ngram ids: ", debugStrSz-strlen(debugStr));
+ for (k2=0;k2<i;k2++) {
+ snprintf(debug1, 256, "%d ", current_ngram[k2]);
+ strncat(debugStr, debug1, debugStrSz-strlen(debugStr));
+ }
+ strncat(debugStr, "\n", debugStrSz-strlen(debugStr));
+
+ return debugStr;
+}
void combine_lm(arpa_lm_t *arpa_lm, arpa_lm_t *lm1, arpa_lm_t *lm2)
{
char *in_line;
@@ -394,12 +431,13 @@
int previd;
TBROWSE_UNION bru;
char** words;
+ char debugStr[1000];
input_line_ptr_orig = input_line;
@@ -409,7 +447,7 @@
arpa_lm->num_kgrams = (ngram_sz_t *) rr_malloc(sizeof(ngram_sz_t)*11);
previous_ngram = (id__t *) rr_calloc(arpa_lm->n,sizeof(id__t));
current_ngram = (id__t *) rr_calloc(arpa_lm->n,sizeof(id__t));
@@ -488,17 +526,19 @@
k = arpa_lm->n;
}
else {
- if ((current_ngram[k] > previous_ngram[k]) && (j > 0)) {
- quit(-1,"Error : n-grams are not correctly ordered.\n");
+ if ((current_ngram[k] < previous_ngram[k]) && (j > 0)) {
+ generate_combined_lm_error_string(debugStr, 1000, "Error : n-grams are not correctly ordered.\n", i, j, previous_ngram, current_ngram, arpa_lm);
+ quit(-1,debugStr,i);
}
}
}
}
+
if (pos_of_novelty != i-1) {
if (i==2) {
/* Deal with unigram pointers */
@@ -597,16 +637,17 @@
pos_of_novelty = k;
k = arpa_lm->n;
}else {
}
}
}
}
quit(-1,debugStr,i);
}
if (pos_of_novelty != arpa_lm->n-1) {