You probably want to provide the LM file that causes the problem.
Anonymous - 2010-03-28
I've tried various things there, and various settings within the limited help and supported usage for sphinx_lm_convert.
The silence tags I've made upper case in the filler, and built the acoustic model from it okay.
I've tried arguments in the converter like mapping and setting debug levels; beyond the formats and the possible settings shown, everything there is an unknown (not shown with help).
I still suspect the newline variations between Windows and Unix, although that has never proven to be an issue here.
I've also used the new online model generator, downloaded the tar, and tried those files directly, but I still get the same problem.
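For what it's worth, the newline theory is cheap to rule out: strip the carriage returns from the LM before converting and see whether anything changes. A minimal sketch of such a filter (dos2unix does the same job):

#include <stdio.h>

/* Copy a file byte by byte, dropping carriage returns, so the LM
   ends up as plain Unix text. */
int main(int argc, char *argv[])
{
    FILE *in, *out;
    int c;

    if (argc < 3 || (in = fopen(argv[1], "rb")) == NULL
                 || (out = fopen(argv[2], "wb")) == NULL)
        return 1;
    while ((c = getc(in)) != EOF)
        if (c != '\r')          /* drop CR, keep everything else */
            putc(c, out);
    fclose(in);
    fclose(out);
    return 0;
}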
Here's the simple numbers-only (no zero) LM file that causes the crash:
Language model created by QuickLM on Sun Mar 28 05:36:22 EDT 2010
Copyright (c) 1996-2010 Carnegie Mellon University and Alexander I. Rudnicky
The model is in standard ARPA format, designed by Doug Paul while he was at
MITRE.
The code that was used to produce this language model is available in Open
Source.
Please visit http://www.speech.cs.cmu.edu/tools/ for
more information
The (fixed) discount mass is 0.5. The backoffs are computed using the ratio
method.
This model based on a corpus of 20 sentences and 11 words
\data\
ngram 1=11
ngram 2=81
ngram 3=146
\1-grams:
-1.3010 </s> -0.3010
-1.3010 <s> -0.0792
-1.1871 EIGHT -0.0414
-1.4260 FIVE -0.0792
-1.3010 FOUR -0.1303
-1.5229 NINE -0.0917
-1.3233 ONE -0.1569
-1.3979 SEVEN -0.1335
-1.3716 SIX -0.0917
-1.2596 THREE -0.0414
-1.4881 TWO -0.1173
\2-grams:
-1.0000 <s> EIGHT -0.2285
-1.3010 <s> FIVE -0.2389
-1.0000 <s> FOUR -0.1139
-1.3010 <s> NINE -0.2218
-1.1249 <s> ONE -0.2264
-1.6021 <s> SEVEN -0.2583
-1.6021 <s> SIX -0.2467
-1.3010 <s> THREE -0.2374
-1.6021 <s> TWO -0.2663
-1.4150 EIGHT </s> -0.3010
-0.9379 EIGHT EIGHT -0.1413
-1.7160 EIGHT FIVE -0.2711
-1.0170 EIGHT FOUR -0.1614
-1.4150 EIGHT NINE -0.2430
-1.4150 EIGHT ONE -0.1836
-1.4150 EIGHT SEVEN -0.2109
-1.4150 EIGHT SIX -0.2467
-1.4150 EIGHT THREE -0.2486
-1.4150 EIGHT TWO -0.1648
-1.4771 FIVE </s> -0.3010
-1.0000 FIVE EIGHT -0.1761
-1.1761 FIVE FIVE -0.2553
-1.4771 FIVE NINE -0.2825
-1.1761 FIVE ONE -0.2398
-1.1761 FIVE SEVEN -0.1938
-1.4771 FIVE SIX -0.2747
-1.1761 FIVE THREE -0.2374
-1.4771 FIVE TWO -0.2285
-1.1249 FOUR </s> -0.3010
-1.6021 FOUR EIGHT -0.2840
-1.0000 FOUR FOUR -0.1903
-1.0000 FOUR NINE -0.1513
-1.0000 FOUR SIX -0.1845
-1.3010 FOUR THREE -0.2374
-1.3010 FOUR TWO -0.2083
-1.3802 NINE </s> -0.3010
-1.3802 NINE EIGHT -0.2478
-1.3802 NINE FOUR -0.2788
-1.3802 NINE ONE -0.2527
-1.0792 NINE SEVEN -0.2583
-1.0792 NINE SIX -0.2609
-1.0792 NINE THREE -0.2704
-1.0792 NINE TWO -0.2083
-1.1027 ONE </s> -0.3010
-0.9777 ONE EIGHT -0.1532
-0.8808 ONE FIVE -0.2041
-0.9777 ONE ONE -0.1526
-1.2788 ONE SEVEN -0.2430
-1.5798 ONE SIX -0.2747
-1.2041 SEVEN </s> -0.3010
-1.2041 SEVEN EIGHT -0.2285
-1.5051 SEVEN NINE -0.2632
-1.0280 SEVEN ONE -0.1836
-1.2041 SEVEN SEVEN -0.2109
-0.9031 SEVEN THREE -0.1761
-1.2041 SEVEN TWO -0.2663
-1.5315 SIX </s> -0.3010
-1.0544 SIX EIGHT -0.1761
-1.5315 SIX FOUR -0.2672
-1.2304 SIX ONE -0.1836
-1.2304 SIX SEVEN -0.2430
-1.2304 SIX SIX -0.2609
-0.9294 SIX THREE -0.2374
-1.2304 SIX TWO -0.2083
-1.1663 THREE </s> -0.3010
-1.6435 THREE EIGHT -0.2840
-1.3424 THREE FIVE -0.2389
-1.0414 THREE FOUR -0.1614
-1.3424 THREE NINE -0.2430
-1.6435 THREE ONE -0.2775
-1.6435 THREE SEVEN -0.2430
-1.0414 THREE SIX -0.2167
-1.1663 THREE THREE -0.1761
-1.6435 THREE TWO -0.2478
-0.8129 TWO </s> -0.3010
-1.4150 TWO EIGHT -0.2571
-0.9379 TWO FIVE -0.2218
-1.4150 TWO FOUR -0.2553
-1.4150 TWO ONE -0.2653
-1.1139 TWO SEVEN -0.2272
-1.4150 TWO THREE -0.2910
\3-grams:
-0.9031 <s> EIGHT NINE
-0.9031 <s> EIGHT SEVEN
-0.9031 <s> EIGHT SIX
-0.9031 <s> EIGHT THREE
-0.6021 <s> FIVE EIGHT
-0.6021 <s> FIVE SIX
-0.9031 <s> FOUR FOUR
-0.9031 <s> FOUR NINE
-0.9031 <s> FOUR SIX
-0.9031 <s> FOUR THREE
-0.6021 <s> NINE SEVEN
-0.6021 <s> NINE TWO
-0.4771 <s> ONE FIVE
-0.7782 <s> ONE SIX
-0.3010 <s> SEVEN ONE
-0.3010 <s> SIX THREE
-0.6021 <s> THREE FIVE
-0.6021 <s> THREE FOUR
-0.3010 <s> TWO SEVEN
-1.0792 EIGHT EIGHT EIGHT
-1.0792 EIGHT EIGHT FIVE
-1.0792 EIGHT EIGHT FOUR
-0.7782 EIGHT EIGHT ONE
-1.0792 EIGHT EIGHT TWO
-0.3010 EIGHT FIVE FIVE
-1.0000 EIGHT FOUR </s>
-0.6990 EIGHT FOUR FOUR
-0.6990 EIGHT FOUR NINE
-0.6021 EIGHT NINE EIGHT
-0.6021 EIGHT NINE THREE
-0.6021 EIGHT ONE EIGHT
-0.6021 EIGHT ONE FIVE
-0.6021 EIGHT SEVEN </s>
-0.6021 EIGHT SEVEN THREE
-0.6021 EIGHT SIX SEVEN
-0.6021 EIGHT SIX SIX
-0.6021 EIGHT THREE </s>
-0.6021 EIGHT THREE NINE
-0.6021 EIGHT TWO </s>
-0.6021 EIGHT TWO FIVE
-0.7782 FIVE EIGHT EIGHT
-0.7782 FIVE EIGHT FOUR
-0.7782 FIVE EIGHT THREE
-0.6021 FIVE FIVE NINE
-0.6021 FIVE FIVE SEVEN
-0.3010 FIVE NINE </s>
-0.6021 FIVE ONE </s>
-0.6021 FIVE ONE SEVEN
-0.6021 FIVE SEVEN ONE
-0.6021 FIVE SEVEN THREE
-0.3010 FIVE SIX TWO
-0.6021 FIVE THREE FOUR
-0.6021 FIVE THREE NINE
-0.3010 FIVE TWO </s>
-0.3010 FOUR EIGHT </s>
-0.9031 FOUR FOUR EIGHT
-0.9031 FOUR FOUR FOUR
-0.6021 FOUR FOUR SIX
-0.9031 FOUR NINE FOUR
-0.9031 FOUR NINE SEVEN
-0.9031 FOUR NINE SIX
-0.9031 FOUR NINE TWO
-0.9031 FOUR SIX </s>
-0.9031 FOUR SIX EIGHT
-0.9031 FOUR SIX ONE
-0.9031 FOUR SIX SIX
-0.6021 FOUR THREE FIVE
-0.6021 FOUR THREE SIX
-0.6021 FOUR TWO </s>
-0.6021 FOUR TWO EIGHT
-0.3010 NINE EIGHT EIGHT
-0.3010 NINE FOUR TWO
-0.3010 NINE ONE EIGHT
-0.6021 NINE SEVEN NINE
-0.6021 NINE SEVEN TWO
-0.6021 NINE SIX FOUR
-0.6021 NINE SIX ONE
-0.3010 NINE THREE THREE
-0.6021 NINE TWO </s>
-0.6021 NINE TWO THREE
-0.9031 ONE EIGHT EIGHT
-0.9031 ONE EIGHT FOUR
-0.9031 ONE EIGHT NINE
-0.9031 ONE EIGHT SEVEN
-1.0000 ONE FIVE </s>
-1.0000 ONE FIVE ONE
-0.6990 ONE FIVE THREE
-1.0000 ONE FIVE TWO
-0.9031 ONE ONE </s>
-0.9031 ONE ONE EIGHT
-0.6021 ONE ONE ONE
-0.6021 ONE SEVEN SEVEN
-0.6021 ONE SEVEN TWO
-0.3010 ONE SIX TWO
-0.6021 SEVEN EIGHT EIGHT
-0.6021 SEVEN EIGHT SIX
-0.3010 SEVEN NINE THREE
-0.7782 SEVEN ONE FIVE
-0.4771 SEVEN ONE ONE
-0.6021 SEVEN SEVEN EIGHT
-0.6021 SEVEN SEVEN THREE
-0.9031 SEVEN THREE </s>
-0.9031 SEVEN THREE FOUR
-0.6021 SEVEN THREE SIX
-0.6021 SEVEN TWO FOUR
-0.6021 SEVEN TWO ONE
-0.7782 SIX EIGHT EIGHT
-0.7782 SIX EIGHT FOUR
-0.7782 SIX EIGHT TWO
-0.3010 SIX FOUR </s>
-0.6021 SIX ONE EIGHT
-0.6021 SIX ONE FIVE
-0.6021 SIX SEVEN </s>
-0.6021 SIX SEVEN EIGHT
-0.3010 SIX SIX EIGHT
-0.9031 SIX THREE EIGHT
-0.9031 SIX THREE ONE
-0.9031 SIX THREE SEVEN
-0.9031 SIX THREE THREE
-0.6021 SIX TWO FIVE
-0.6021 SIX TWO SEVEN
-0.3010 THREE EIGHT </s>
-0.6021 THREE FIVE FIVE
-0.6021 THREE FIVE ONE
-0.9031 THREE FOUR </s>
-0.9031 THREE FOUR SIX
-0.9031 THREE FOUR THREE
-0.9031 THREE FOUR TWO
-0.6021 THREE NINE ONE
-0.6021 THREE NINE SIX
-0.3010 THREE ONE SEVEN
-0.3010 THREE SEVEN THREE
-0.9031 THREE SIX SEVEN
-0.4260 THREE SIX THREE
-0.7782 THREE THREE </s>
-0.7782 THREE THREE FOUR
-0.7782 THREE THREE SIX
-0.3010 THREE TWO FIVE
-0.3010 TWO EIGHT FOUR
-0.4771 TWO FIVE EIGHT
-0.7782 TWO FIVE SEVEN
-0.3010 TWO FOUR NINE
-0.3010 TWO ONE </s>
-0.6021 TWO SEVEN ONE
-0.6021 TWO SEVEN SEVEN
-0.3010 TWO THREE TWO
\end\
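A quick sanity check for a file like this is to compare the counts declared under \data\ with the number of entries actually listed in each \N-grams: section, since the converter sizes its arrays from the declared counts (for this file they do match: 11, 81 and 146). A minimal sketch of such a checker; this is a hypothetical standalone helper, not anything shipped with sphinxbase:

#include <stdio.h>
#include <string.h>

/* Compare the counts declared in the \data\ section of an ARPA file
   against the number of entries actually present in each section. */
int main(int argc, char *argv[])
{
    char line[1024];
    long declared[10] = {0}, found[10] = {0}, cnt;
    int order = 0, n;
    FILE *fp;

    if (argc < 2 || (fp = fopen(argv[1], "r")) == NULL)
        return 1;
    while (fgets(line, sizeof(line), fp)) {
        if (sscanf(line, "ngram %d=%ld", &n, &cnt) == 2 && n > 0 && n < 10)
            declared[n] = cnt;                /* \data\ declaration */
        else if (sscanf(line, "\\%d-grams:", &n) == 1 && n > 0 && n < 10)
            order = n;                        /* entering a section */
        else if (strncmp(line, "\\end\\", 5) == 0)
            order = 0;
        else if (order && line[0] != '\n' && line[0] != '\r')
            found[order]++;                   /* one entry per line */
    }
    fclose(fp);
    for (n = 1; n < 10; n++)
        if (declared[n] || found[n])
            printf("%d-grams: declared %ld, found %ld\n",
                   n, declared[n], found[n]);
    return 0;
}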
The fix will be available in the nightly tarball soon. You can use lm3g2dmp for a while.
Anonymous - 2010-03-29
Okay cool thanks!
I just got the nightly sphinxbase dated 2010-03-29
make had this to say about it...
Making all in python
make: Entering directory `/cygdrive/c/Library/Java/sphinxbase_2010-03-29/python'
test -z "" || cp "./sphinxbase.c" sphinxbase.c
/usr/bin/python setup.py build
running build
running build_ext
building 'sphinxbase' extension
creating build
creating build/temp.cygwin-1.5.25-i686-2.5
gcc -fno-strict-aliasing -DNDEBUG -g -fwrapv -O3 -Wall -Wstrict-prototypes -I../include -I../include -I/usr/include/python2.5 -c sphinxbase.c -o build/temp.cygwin-1.5.25-i686-2.5/sphinxbase.o
sphinxbase.c: In function `__pyx_pf_10sphinxbase_10NGramModel_get_counts':
sphinxbase.c:1717: warning: assignment discards qualifiers from pointer target type
sphinxbase.c: In function `__pyx_f_10sphinxbase_9NGramIter_set_iter':
sphinxbase.c:2848: warning: assignment discards qualifiers from pointer target type
sphinxbase.c: In function `__pyx_pf_10sphinxbase_8HuffCode_decode':
sphinxbase.c:4776: warning: passing arg 2 of `huff_code_decode_str' from incompatible pointer type
sphinxbase.c:4776: warning: assignment discards qualifiers from pointer target type
sphinxbase.c: In function `__pyx_pf_10sphinxbase_8HuffCode_decode_from_file':
sphinxbase.c:5227: warning: assignment discards qualifiers from pointer target type
sphinxbase.c: At top level:
sphinxbase.c:707: warning: '__pyx_doc_10sphinxbase_7LogMath___cinit__' defined but not used
sphinxbase.c:805: warning: '__pyx_doc_10sphinxbase_7LogMath___dealloc__' defined but not used
sphinxbase.c:1216: warning: '__pyx_doc_10sphinxbase_10NGramModel___cinit__' defined but not used
sphinxbase.c:1491: warning: '__pyx_doc_10sphinxbase_10NGramModel___dealloc__' defined but not used
sphinxbase.c:3502: warning: '__pyx_doc_10sphinxbase_8HuffCode___init__' defined but not used
creating build/lib.cygwin-1.5.25-i686-2.5
gcc -shared -Wl,--enable-auto-image-base build/temp.cygwin-1.5.25-i686-2.5/sphinxbase.o -L../src/libsphinxbase/.libs -L/usr/lib/python2.5/config -lsphinxbase -lpython2.5 -o build/lib.cygwin-1.5.25-i686-2.5/sphinxbase.dll
../src/libsphinxbase/.libs/libsphinxbase.a(ngram_model.o): In function `ngram_model_recode':
/cygdrive/c/Library/Java/sphinxbase_2010-03-29/src/libsphinxbase/lm/ngram_model.c:359: undefined reference to `_libiconv_open'
/cygdrive/c/Library/Java/sphinxbase_2010-03-29/src/libsphinxbase/lm/ngram_model.c:407: undefined reference to `_libiconv'
/cygdrive/c/Library/Java/sphinxbase_2010-03-29/src/libsphinxbase/lm/ngram_model.c:413: undefined reference to `_libiconv'
/cygdrive/c/Library/Java/sphinxbase_2010-03-29/src/libsphinxbase/lm/ngram_model.c:417: undefined reference to `_libiconv'
/cygdrive/c/Library/Java/sphinxbase_2010-03-29/src/libsphinxbase/lm/ngram_model.c:458: undefined reference to `_libiconv_close'
/cygdrive/c/Library/Java/sphinxbase_2010-03-29/src/libsphinxbase/lm/ngram_model.c:427: undefined reference to `_libiconv'
collect2: ld returned 1 exit status
error: command 'gcc' failed with exit status 1
make: *** Error 1
make: Leaving directory `/cygdrive/c/Library/Java/sphinxbase_2010-03-29/python'
make: *** Error 1
Anonymous - 2010-03-29
Despite the 'make' error, it did create a sphinx_lm_convert, which has converted my LM to DMP finally, thanks.
I'm starting a new thread on Config.xml stuff next, I suppose.
Okay, so now I tried running the cygwin-built sphinx_lm_convert on Vista, but being my first time, something's wrong or unclear...
C:\Library\Java\SphinxTrain-1.0\tutorial\TestNumbers\etc>C:\Library\Java\sphinxbase-0.6\src\sphinx_lmtools\.libs\sphinx_lm_convert.exe -i TestNumbers.lm -o TestNumbers.lm.dmp
INFO: cmd_ln.c(512): Parsing command line:
/cygdrive/c/Library/Java/sphinxbase-0.6/src/sphinx_lmtools/.libs/sphinx_lm_convert \
-i TestNumbers.lm \
-o TestNumbers.lm.dmp
Current configuration:
-case
-debug 0
-help no no
-i TestNumbers.lm
-ienc
-ifmt
-logbase 1.0001 1.000100e+00
-mmap no no
-o TestNumbers.lm.dmp
-oenc utf8 utf8
-ofmt
INFO: ngram_model_arpa.c(476): ngrams 1=11, 2=81, 3=146
INFO: ngram_model_arpa.c(135): Reading unigrams
INFO: ngram_model_arpa.c(515): 11 = #unigrams created
INFO: ngram_model_arpa.c(194): Reading bigrams
INFO: ngram_model_arpa.c(531): 81 = #bigrams created
INFO: ngram_model_arpa.c(532): 32 = #prob2 entries
INFO: ngram_model_arpa.c(539): 46 = #bo_wt2 entries
INFO: ngram_model_arpa.c(291): Reading trigrams
INFO: ngram_model_arpa.c(552): 146 = #trigrams created
INFO: ngram_model_arpa.c(553): 10 = #prob3 entries
INFO: ngram_model_dmp.c(492): Building DMP model...
INFO: ngram_model_dmp.c(522): 11 = #unigrams created
assertion "tgptr - model->lm3g.trigrams < newbase->n_counts" failed: file
"ngram_model_dmp.c", line 594
27394 sphinx_lm_convert 5656 open_stackdumpfile: Dumping stack trace to
sphinx_lm_convert.exe.stackdump
198246 sphinx_lm_convert 5656 _cygtls::handle_exceptions: Exception:
STATUS_ACCESS_VIOLATION
279379 sphinx_lm_convert 5656 _cygtls::handle_exceptions: Exception:
STATUS_ACCESS_VIOLATION
283992 sphinx_lm_convert 5656 _cygtls::handle_exceptions: Error while dumping
state (probably corrupted stack)
Here's the crash report sphinx_lm_convert.exe.stackdump
Stack trace:
Frame Function Args
0027C768 770311D8 (0000012C, 0000EA60, 000000A4, 0027C7B0)
0027C888 61097F54 (00000000, 770312A0, 776A04A2, 000000A4)
0027C978 61095AEB (00000000, 0053002B, 002B002B, 0027CE68)
0027C9D8 61095FCB (0027C9F0, 00000000, 00000094, 61020C1B)
0027CA98 61096182 (00001618, 00000006, 0027CAC8, 61096383)
0027CAA8 610961AC (00000006, 0027CE88, 995F0000, 6109A7DF)
0027CAC8 61096383 (6110D008, 00415BE0, 00415779, 00000252)
0027CAF8 61001087 (00415779, 00000252, 00415BE0, 00000000)
0027CB88 610935A8 (00FC22F8, 00000002, 0027CBB8, 0041408D)
0027CBD8 0040B117 (00FC22F8, 00FC2D68, 00000002, 00FC2070)
0027CC28 00401267 (00000005, 00FC0178, 00FC0090, 61002AF2)
0027CD68 610060D8 (00000000, 0027CDA0, 61005450, 0027CDA0)
61005450 61004416 (0000009C, A02404C7, E8611021, FFFFFF48)
198246 sphinx_lm_convert 5656 _cygtls::handle_exceptions: Exception: STATUS_ACCESS_VIOLATION
Exception: STATUS_ACCESS_VIOLATION at eip=61016583
eax=EC815356 ebx=61108148 ecx=00000000 edx=57E58959 esi=0000000D edi=00000001
ebp=00BBC898 esp=00BBC890 program=C:\Library\Java\sphinxbase-0.6\src\sphinx_lmtools\.libs\sphinx_lm_convert.exe, pid 5656, thread sig
cs=0023 ds=002B es=002B fs=0053 gs=002B ss=002B
Stack trace:
Frame Function Args
00BBC898 61016583 (61108148, 6111C19B, FFFFFF48, 00000000)
00BBC8B8 610166EC (00000001, 00000000, 00000000, 00BBC940)
00BBC8F8 61017FD5 (00000058, 00BBC940, 00000000, 00000000)
00BBCC38 61018638 (00000000, 00BBCC70, 000000A4, 00BBCC6C)
00BBCD38 61099F57 (61106F00, 00000000, 00000000, 00000000)
00BBCD68 61002F32 (00BBCE64, 61018970, 00001074, 00000000)
61003650 61003769 (04A16430, 89000000, FFDA90B0, 24468BFF)
279379 sphinx_lm_convert 5656 _cygtls::handle_exceptions: Exception: STATUS_ACCESS_VIOLATION
283992 sphinx_lm_convert 5656 _cygtls::handle_exceptions: Error while dumping state (probably corrupted stack)
The offending line of code is
assert(tgptr - model->lm3g.trigrams < newbase->n_counts[2]);
in ngram_model_dmp.c of sphinxbase 0.6. The difference tgptr - model->lm3g.trigrams is the number of trigram records filled in so far, so the assertion fires when the builder tries to write more trigrams than the count declared in the model header. Here is the whole function:
ngram_model_dmp_t *
ngram_model_dmp_build(ngram_model_t *base)
{
    ngram_model_dmp_t *model;
    ngram_model_t *newbase;
    ngram_iter_t *itor;
    sorted_list_t sorted_prob2;
    sorted_list_t sorted_bo_wt2;
    sorted_list_t sorted_prob3;
    bigram_t *bgptr;
    trigram_t *tgptr;
    int i, bgcount, tgcount, seg;

    if (base->funcs == &ngram_model_dmp_funcs) {
        E_INFO("Using existing DMP model.\n");
        return (ngram_model_dmp_t *)ngram_model_retain(base);
    }

    /* Initialize new base model structure with params from base. */
    E_INFO("Building DMP model...\n");
    model = ckd_calloc(1, sizeof(*model));
    newbase = &model->base;
    ngram_model_init(newbase, &ngram_model_dmp_funcs,
                     logmath_retain(base->lmath),
                     base->n, base->n_counts[0]);
    /* Copy N-gram counts over. */
    memcpy(newbase->n_counts, base->n_counts,
           base->n * sizeof(*base->n_counts));
    /* Make sure word strings are freed. */
    newbase->writable = TRUE;

    /* Initialize unigram table and string table. */
    model->lm3g.unigrams = new_unigram_table(newbase->n_counts[0] + 1);
    for (itor = ngram_model_mgrams(base, 0); itor;
         itor = ngram_iter_next(itor)) {
        int32 prob1, bo_wt1;
        int32 const *wids;

        /* Can't guarantee they will go in unigram order, so just to
           be correct, we do this... */
        wids = ngram_iter_get(itor, &prob1, &bo_wt1);
        model->lm3g.unigrams[wids[0]].prob1.l = prob1;
        model->lm3g.unigrams[wids[0]].bo_wt1.l = bo_wt1;
        newbase->word_str[wids[0]] = ckd_salloc(ngram_word(base, wids[0]));
        if ((hash_table_enter_int32(newbase->wid,
                                    newbase->word_str[wids[0]], wids[0]))
            != wids[0]) {
            E_WARN("Duplicate word in dictionary: %s\n",
                   newbase->word_str[wids[0]]);
        }
    }
    E_INFO("%8d = #unigrams created\n", newbase->n_counts[0]);

    /* Construct quantized probability table for bigrams and
       (optionally) trigrams.  Hesitate to use the "sorted list" thing
       since it isn't so useful, but it's there already. */
    init_sorted_list(&sorted_prob2);
    if (newbase->n > 2) {
        init_sorted_list(&sorted_bo_wt2);
        init_sorted_list(&sorted_prob3);
    }
    /* Construct bigram and trigram arrays. */
    bgptr = model->lm3g.bigrams = ckd_calloc(newbase->n_counts[1] + 1,
                                             sizeof(bigram_t));
    if (newbase->n > 2) {
        tgptr = model->lm3g.trigrams = ckd_calloc(newbase->n_counts[2],
                                                  sizeof(trigram_t));
        model->lm3g.tseg_base =
            ckd_calloc((newbase->n_counts[1] + 1) / BG_SEG_SZ + 1,
                       sizeof(int32));
    }
    else
        tgptr = NULL;

    /* Since bigrams and trigrams have to be contiguous with others
       with the same N-1-gram, we traverse them in depth-first order
       to build the bigram and trigram arrays. */
    for (i = 0; i < newbase->n_counts[0]; ++i) {
        ngram_iter_t *uitor;

        bgcount = bgptr - model->lm3g.bigrams;
        /* First bigram index (same as next if no bigrams...) */
        model->lm3g.unigrams[i].bigrams = bgcount;
        E_DEBUG(2, ("unigram %d: %s => bigram %d\n",
                    i, newbase->word_str[i], bgcount));
        /* All bigrams corresponding to unigram i */
        uitor = ngram_ng_iter(base, i, NULL, 0);
        for (itor = ngram_iter_successors(uitor);
             itor; ++bgptr, itor = ngram_iter_next(itor)) {
            int32 prob2, bo_wt2;
            int32 const *wids;
            ngram_iter_t *titor;

            wids = ngram_iter_get(itor, &prob2, &bo_wt2);
            /* FIXME FIXME FIXME: not sure why this happens... */
            if (bgptr - model->lm3g.bigrams >= newbase->n_counts[1]) {
                ngram_iter_free(itor);
                break;
            }
            bgptr->wid = wids[1];
            bgptr->prob2 = sorted_id(&sorted_prob2, &prob2);
            if (newbase->n > 2) {
                tgcount = (tgptr - model->lm3g.trigrams);
                /* Backoff weight (only if there are trigrams...) */
                bgptr->bo_wt2 = sorted_id(&sorted_bo_wt2, &bo_wt2);

                /* Find bigram segment for this bigram (this isn't
                   used unless there are trigrams) */
                seg = bgcount >> LOG_BG_SEG_SZ;
                /* If we just crossed a bigram segment boundary, then
                   point tseg_base for the new segment to the current
                   trigram pointer. */
                if (seg != (bgcount - 1) >> LOG_BG_SEG_SZ)
                    model->lm3g.tseg_base[seg] = tgcount;
                /* Now calculate the trigram offset. */
                bgptr->trigrams = tgcount - model->lm3g.tseg_base[seg];
                E_DEBUG(2, ("bigram %d %s %s => trigram %d:%d\n",
                            bgcount,
                            newbase->word_str[wids[0]],
                            newbase->word_str[wids[1]],
                            seg, bgptr->trigrams));

                /* And fill in successors' trigram info. */
                for (titor = ngram_iter_successors(itor);
                     titor; ++tgptr, titor = ngram_iter_next(titor)) {
                    int32 prob3, dummy;

                    assert(tgptr - model->lm3g.trigrams < newbase->n_counts[2]);
                    wids = ngram_iter_get(titor, &prob3, &dummy);
                    tgptr->wid = wids[2];
                    tgptr->prob3 = sorted_id(&sorted_prob3, &prob3);
                    E_DEBUG(2, ("trigram %d %s %s %s => prob %d\n",
                                tgcount,
                                newbase->word_str[wids[0]],
                                newbase->word_str[wids[1]],
                                newbase->word_str[wids[2]],
                                tgptr->prob3));
                }
            }
        }
        ngram_iter_free(uitor);
    }
    /* Add sentinel unigram and bigram records. */
    bgcount = bgptr - model->lm3g.bigrams;
    tgcount = tgptr - model->lm3g.trigrams;
    seg = bgcount >> LOG_BG_SEG_SZ;
    if (seg != (bgcount - 1) >> LOG_BG_SEG_SZ)
        model->lm3g.tseg_base[seg] = tgcount;
    model->lm3g.unigrams[i].bigrams = bgcount;
    bgptr->trigrams = tgcount - model->lm3g.tseg_base[seg];

    /* Now create probability tables. */
    model->lm3g.n_prob2 = sorted_prob2.free;
    model->lm3g.prob2 = vals_in_sorted_list(&sorted_prob2);
    E_INFO("%8d = #bigrams created\n", newbase->n_counts[1]);
    E_INFO("%8d = #prob2 entries\n", model->lm3g.n_prob2);
    free_sorted_list(&sorted_prob2);
    if (newbase->n > 2) {
        /* Create trigram bo-wts array. */
        model->lm3g.n_bo_wt2 = sorted_bo_wt2.free;
        model->lm3g.bo_wt2 = vals_in_sorted_list(&sorted_bo_wt2);
        free_sorted_list(&sorted_bo_wt2);
        E_INFO("%8d = #bo_wt2 entries\n", model->lm3g.n_bo_wt2);
        /* Create trigram probability table. */
        model->lm3g.n_prob3 = sorted_prob3.free;
        model->lm3g.prob3 = vals_in_sorted_list(&sorted_prob3);
        E_INFO("%8d = #trigrams created\n", newbase->n_counts[2]);
        E_INFO("%8d = #prob3 entries\n", model->lm3g.n_prob3);
        free_sorted_list(&sorted_prob3);
        /* Initialize tginfo */
        model->lm3g.tginfo =
            ckd_calloc(newbase->n_counts[0], sizeof(tginfo_t *));
        model->lm3g.le = listelem_alloc_init(sizeof(tginfo_t));
    }

    return model;
}
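A note on the tseg_base arithmetic above: the DMP format stores, for each bigram, only a small trigram offset, plus one 32-bit base per block of BG_SEG_SZ bigrams; the absolute index of a bigram's first trigram is the sum of the two. A minimal sketch of the lookup, assuming the uint16 offset field and BG_SEG_SZ = 2^LOG_BG_SEG_SZ = 512 that lm3g uses:

#include <stdint.h>

#define LOG_BG_SEG_SZ 9    /* BG_SEG_SZ = 512 */

/* One shared 32-bit base per 512 bigrams, plus the bigram's own
   small offset, reconstructs the absolute trigram index. */
static int32_t first_trigram(const int32_t *tseg_base,
                             const uint16_t *bg_trigram_off,
                             int32_t bg)
{
    return tseg_base[bg >> LOG_BG_SEG_SZ] + bg_trigram_off[bg];
}

The running value tgptr - model->lm3g.trigrams in the assertion is exactly this absolute index, which is why it must stay below the declared trigram count.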
K, thanks. This is the bug in sphinxbase that is reproduced on Linux as well.
Let me fix it.
This was fixed in sphinxbase trunk:
http://cmusphinx.svn.sourceforge.net/viewvc/cmusphinx?view=rev&revision=9934
This cygwin build problem was fixed in svn trunk.