I finally set up node-pocketsphinx and I was hoping I could use this to get phonemes with timestamps.
I looked around, and it seems you need to set -allphone to the phone path. Then you need to use decoder.seg(), which is meant to return phonemes and timestamps. However, I tried using decoder.seg(), and it always returned nothing. I think this was because it got called before the decoder finished, so I tried wrapping everything in a promise, but this got called early as well.
Then I re-read the code posted on that question, and I realised he never even set -allphone, yet it still appears to return phonemes. What is the advantage of setting -allphone if it returns phonemes either way? EDIT: it doesn't work unless you use -allphone
// Phoneme-level decoding with node-pocketsphinx.
//
// Reads a raw audio file, runs it through a decoder configured with a phone
// language model ("-allphone"), then prints the hypothesis followed by one
// line per segment: the segment label, its start frame and its end frame.
const fs = require('fs');
const ps = require('..').ps;

// Directory holding the en-us acoustic model, dictionary and phone LM.
const modeldir = "../../pocketsphinx/model/en-us/";

const config = new ps.Decoder.defaultConfig();
config.setString("-hmm", modeldir + "en-us");
config.setString("-dict", modeldir + "cmudict-en-us.dict");
// Per the surrounding discussion, segments only come back as phonemes when
// -allphone points at the phone language model.
config.setString("-allphone", modeldir + "en-us-phone.lm.bin");
const decoder = new ps.Decoder(config);

fs.readFile("../../pocketsphinx/test/data/goforward.raw", function(err, data) {
  if (err) throw err;

  // The whole buffer is fed as a single utterance.
  // NOTE(review): the two `false` flags are presumably no_search / full_utt;
  // confirm against the binding's processRaw documentation.
  decoder.startUtt();
  decoder.processRaw(data, false, false);
  decoder.endUtt();
  console.log(decoder.hyp());

  // Declared locally: the original assigned `it` and `seg` without a
  // declaration, which leaks globals and throws in strict mode / ES modules.
  const it = decoder.seg().iter();
  let seg;
  // Loose `!= null` is intentional: stop on either null or undefined.
  while ((seg = it.next()) != null) {
    console.log(seg.word, seg.startFrame, seg.endFrame);
  }
});
_exports_Hypothesis {
prob: 0,
bestScore: -5813,
hypstr: 'SIL T OW F AO ER T AE NG IY D ER S SIL' }
SIL 0 45
T 46 51
OW 52 63
F 64 78
AO 79 93
ER 94 113
T 114 130
AE 131 139
NG 140 156
IY 157 169
D 170 172
ER 173 190
S 191 211
SIL 212 260
If you would like to refer to this comment somewhere else in this project, copy and paste the following link:
I finally set up node-pocketsphinx and I was hoping I could use this to get phonemes with timestamps.
I looked around, and it seems you need to set -allphone to the phone path.
Then you need to use decoder.seg(), which is meant to return phonemes and timestamps. However, I tried using decoder.seg(), and it always returned nothing. I think this was because it got called before the decoder finished, so I tried wrapping everything in a promise, but this got called early as well.
Then I re-read the code posted on that question, and I realised he never even set -allphone, yet it still appears to return phonemes. What is the advantage of setting -allphone if it returns phonemes either way? EDIT: it doesn't work unless you use -allphone
Then I read that you could align timestamps using the alignment API. This confused me even more, because I thought that decoder.seg() already did that.
I'm probably just stupid but I'm a bit lost as to what to do, and the difference between things.
This is the current code, based on the python phoneme example:
const fs = require("fs");
const ps = require("pocketsphinx").ps;
const model = "source/pocketsphinx/model/en-us/";
let config = new ps.Decoder.defaultConfig();
config.setString("-hmm", model + "en-us");
config.setString("-dict", model + "cmudict-en-us.dict");
config.setString("-lm", model + "en-us.lm.bin");
config.setString("-allphone", model + "en-us-phone.lm.bin");
config.setFloat("-lw", 2.0);
config.setFloat("-pip", 0.3);
config.setFloat("-beam", 1e-200);
config.setFloat("-pbeam", 1e-20);
config.setBoolean("-mmap", false);
let decoder = new ps.Decoder(config);
fs.readFile("test.wav", function(err, data) {
    if (err) throw err;
    decoder.startUtt();
    decoder.processRaw(data, false, false);
    decoder.endUtt();
    console.log(decoder.hyp());
});
It works, but with no timestamps. It'd be fantastic if someone could tell me what I should do.
Last edit: MysteryPancake 2018-01-29
This code outputs timestamps:
var fs = require('fs');
var ps = require('..').ps;
modeldir = "../../pocketsphinx/model/en-us/"
var config = new ps.Decoder.defaultConfig();
config.setString("-hmm", modeldir + "en-us");
config.setString("-dict", modeldir + "cmudict-en-us.dict");
config.setString("-allphone", modeldir + "en-us-phone.lm.bin");
var decoder = new ps.Decoder(config);
fs.readFile("../../pocketsphinx/test/data/goforward.raw", function(err, data) {
    if (err) throw err;
    decoder.startUtt();
    decoder.processRaw(data, false, false);
    decoder.endUtt();
    console.log(decoder.hyp())
    it = decoder.seg().iter()
    while ((seg = it.next()) != null) {
        console.log(seg.word, seg.startFrame, seg.endFrame);
    }
});
Output:
_exports_Hypothesis {
  prob: 0,
  bestScore: -5813,
  hypstr: 'SIL T OW F AO ER T AE NG IY D ER S SIL' }
SIL 0 45
T 46 51
OW 52 63
F 64 78
AO 79 93
ER 94 113
T 114 130
AE 131 139
NG 140 156
IY 157 169
D 170 172
ER 173 190
S 191 211
SIL 212 260
Thank you so much!
Last edit: MysteryPancake 2018-01-30