Menu

lm_combine fails on gigaword: repeated 2-gram

Help
2008-04-16
2012-09-22
  • Bayle Shanks

    Bayle Shanks - 2008-04-16

    lm_giga from http://www.inference.phy.cam.ac.uk/kv227/lm_giga/ . Here's what happens:

    ./cmuclmtk_trunk/src/lm_combine -lm1 lm_giga_5k_nvp_3gram.arpa -lm2 lm_giga_5k_nvp_3gram.arpa -weight w.wt -lm a.arpa
    Reading in a 3-gram language model.
    Number of 1-grams = 5000.
    Number of 2-grams = 2821547.
    Number of 3-grams = 8095821.
    Reading unigrams...

    Reading 2-grams...
    ..................................................
    ..................................................
    .........................................
    Reading 3-grams...
    ..................................................
    ..................................................
    ..................................................
    ..................................................
    ..................................................
    ..................................................
    ..................................................
    ..................................................
    ....Reading in a 3-gram language model.
    Number of 1-grams = 5000.
    Number of 2-grams = 2821547.
    Number of 3-grams = 8095821.
    Reading unigrams...

    Reading 2-grams...
    ..................................................
    ..................................................
    .........................................
    Reading 3-grams...
    ..................................................
    ..................................................
    ..................................................
    ..................................................
    ..................................................
    ..................................................
    ..................................................
    ..................................................
    ....
    combine lms
    Reading in a 3-gram language model.
    Number of 1-grams = 5001.
    Number of 2-grams = 2821547.
    Number of 3-grams = 8095821.
    Reading unigrams...

    Reading 2-grams...
    ..................................................
    ..................................................
    .........................................Error - Repeated 2-gram in ARPA format language model.

     
    • Bayle Shanks

      Bayle Shanks - 2008-04-30

      I think this is a workaround that creates a modified version of gigaword that lm_combine can handle:

      cp -f lm_giga_5k_nvp_3gram/lm_giga_5k_nvp_3gram.arpa mylm5v
      perl -e 'undef $/; $_=<>; s/<unk>/<UNK>/; print' -i mylm5v
      perl -e 'undef $/; $_=<>; s/^.*<unk>.*\n//gm; print' -i mylm5v
      onegrams=`perl -e 'undef $/; $_=<>; /\\\\1-grams:\n(.*?)\n\\\\2-grams:/s; $_=$1; $c=0; while (/\n/g) {$c++} print $c' mylm5v`
      twograms=`perl -e 'undef $/; $_=<>; /\\\\2-grams:\n(.*?)\n\\\\3-grams:/s; $_=$1; $c=0; while (/\n/g) {$c++} print $c' mylm5v`
      threegrams=`perl -e 'undef $/; $_=<>; /\\\\3-grams:\n(.*?)\n\\\\end\\\\/s; $_=$1; $c=0; while (/\n/g) {$c++} print $c' mylm5v`
      perl -e 's/ngram 1=.*/ngram 1='$onegrams'/;' -pi mylm5v
      perl -e 's/ngram 2=.*/ngram 2='$twograms'/;' -pi mylm5v
      perl -e 's/ngram 3=.*/ngram 3='$threegrams'/;' -pi mylm5v
      perl -e 'undef $/; $_=<>; s/(.*UNK.*\n)//; $unknownLine = $1; s/\\1-grams:\s*\n/\\1-grams:\n$unknownLine/; print' -i mylm5v

      Now you can use mylm5v in place of lm_giga_5k_nvp_3gram.arpa as an argument to lm_combine.

       
    • Bayle Shanks

      Bayle Shanks - 2008-04-29

      The problem is actually out-of-order n-grams. lm_combine reports the wrong error because of what I think is a bug in its error checking. Here is a patch that fixes its error-checking code and also gives more informative error messages in this situation. I'm not a C programmer, so you might want to check whether my use of strncat is secure. I believe you can apply this patch with the command "cd cmuclmtk ; patch -p1 < lm_combine_diff".

      --- cmuclmtk_trunk/src/programs/lm_combine.c 2008-04-15 21:52:26.000000000 -0700
      +++ cmuclmtk/src/programs/lm_combine.c 2008-04-29 00:43:21.000000000 -0700
      @@ -378,6 +378,43 @@
      DeleteArray(words);
      }

      +char *generate_combined_lm_error_string(char *debugStr, int debugStrSz, char *initial_string, int i, int j, id__t *previous_ngram, id__t *current_ngram, arpa_lm_t *arpa_lm)
      +{
      +char debug1[256], debug2[256], debug3[256], debug4[256];
      + int k2;
      +
      + snprintf(debugStr, debugStrSz, initial_string);
      + snprintf(debug1, 256, "At position %d of the %d-grams\n", j, i);
      + strncat(debugStr, debug1, debugStrSz-strlen(debugStr));
      + strncat(debugStr, "previous ngram: ", debugStrSz-strlen(debugStr));
      + for (k2=0;k2<i;k2++) {
      + /*strcpy(debug1,arpa_lm->vocab[arpa_lm->word_id[k2][j]]);*/
      + strncat(debugStr, arpa_lm->vocab[previous_ngram[k2]], debugStrSz-strlen(debugStr));
      + strncat(debugStr, " ", debugStrSz-strlen(debugStr));
      + }
      + strncat(debugStr, "\n", debugStrSz-strlen(debugStr));
      + strncat(debugStr, "previous ngram ids: ", debugStrSz-strlen(debugStr));
      + for (k2=0;k2<i;k2++) {
      + snprintf(debug1, 256, "%d ", previous_ngram[k2], debugStrSz-strlen(debugStr));
      + strncat(debugStr, debug1, debugStrSz-strlen(debugStr));
      + }
      + strncat(debugStr, "\n", debugStrSz-strlen(debugStr));
      +
      + strncat(debugStr, "next ngram: ", debugStrSz-strlen(debugStr));
      + for (k2=0;k2<i;k2++) {
      + strncat(debugStr,arpa_lm->vocab[current_ngram[k2]], debugStrSz-strlen(debugStr));
      + strncat(debugStr, " ", debugStrSz-strlen(debugStr));
      + }
      + strncat(debugStr, "\n", debugStrSz-strlen(debugStr));
      + strncat(debugStr, "next ngram ids: ", debugStrSz-strlen(debugStr));
      + for (k2=0;k2<i;k2++) {
      + snprintf(debug1, 256, "%d ", current_ngram[k2]);
      + strncat(debugStr, debug1, debugStrSz-strlen(debugStr));
      + }
      + strncat(debugStr, "\n", debugStrSz-strlen(debugStr));
      +
      + return debugStr;
      +}
      void combine_lm(arpa_lm_t *arpa_lm, arpa_lm_t *lm1, arpa_lm_t *lm2)
      {
      char *in_line;
      @@ -394,12 +431,13 @@
      int previd;
      TBROWSE_UNION bru;
      char** words;
      + char debugStr[1000];

      words=(char**)NewArray(15,MAX_WORD,sizeof(char));
      
      in_line = (char *) rr_malloc(1024*sizeof(char));
      input_line = (char *) rr_malloc(1024*sizeof(char));
      
      -
      +
        input_line_ptr_orig = input_line;

      @@ -409,7 +447,7 @@
      arpa_lm->num_kgrams = (ngram_sz_t *) rr_malloc(sizeof(ngram_sz_t)*11);

      calc_merged_ngram_num(arpa_lm, lm1, lm2);
      
      -
      +
        previous_ngram = (id__t *) rr_calloc(arpa_lm->n,sizeof(id__t));
        current_ngram = (id__t *) rr_calloc(arpa_lm->n,sizeof(id__t));

      @@ -488,17 +526,19 @@
      k = arpa_lm->n;
      }
      else {
      - if ((current_ngram[k] > previous_ngram[k]) && (j > 0)) {
      - quit(-1,"Error : n-grams are not correctly ordered.\n");
      + if ((current_ngram[k] < previous_ngram[k]) && (j > 0)) {
      + generate_combined_lm_error_string(debugStr, 1000, "Error : n-grams are not correctly ordered.\n", i, j, previous_ngram, current_ngram, arpa_lm);
      + quit(-1,debugStr,i);
      }
      }
      }
      }

      - if (pos_of_novelty == i && j != 1)
      - quit(-1,"Error - Repeated %d-gram in ARPA format language model.\n",
      - i);
      + if (pos_of_novelty == (i) && j != 1) {
      + generate_combined_lm_error_string(debugStr, 1000, "\nError - Repeated %d-gram in ARPA format language model.\n", i, j, previous_ngram, current_ngram, arpa_lm);
      + quit(-1,debugStr,i);
      + }
        +
        if (pos_of_novelty != i-1) {
        if (i==2) {
        /* Deal with unigram pointers */
        @@ -597,16 +637,17 @@
        pos_of_novelty = k;
        k = arpa_lm->n;
        }else {
      - if ((current_ngram[k] > previous_ngram[k]) && (j>0)) {
      - quit(-1,"Error : n-grams are not correctly ordered.\n");
      + if ((current_ngram[k] < previous_ngram[k]) && (j>0)) {
      + generate_combined_lm_error_string(debugStr, 1000, "Error : n-grams are not correctly ordered.\n", i, j, previous_ngram, current_ngram, arpa_lm);
      + quit(-1,debugStr,i);
        }
        }
        }
        }
        if ( pos_of_novelty == arpa_lm->n+1 && j != 1 ) {
        
        - quit(-1,"Error : Same %d-gram occurs twice in ARPA format LM.\n",
        - arpa_lm->n);
        + generate_combined_lm_error_string(debugStr, 1000, "\nError - Repeated %d-gram in ARPA format language model.\n", i, j, previous_ngram, current_ngram, arpa_lm);
        + quit(-1,debugStr,i);
          }

          if (pos_of_novelty != arpa_lm->n-1) {

       

Log in to post a comment.

Want the latest updates on software, tech news, and AI?
Get latest updates about software, tech news, and AI from SourceForge directly in your inbox once a month.