I am trying to decode a .wav file to text and have finally managed to get a config file that (almost) works. However, with every file I try to decode, I get an error message that reads:
NonSpeechDataFilter: ALERT: getting a SpeechStartSignal while in speech, removing it.
Also, the decoding (whilst not bad in accuracy) seems to only start halfway through my files (they are about 10s long).
Does anyone know whether these two issues could be related or have any advice on how to tweak the NonSpeechDataFilter or related components?
Make sure that your audio file is encoded according to your StreamDataSource. If you want to avoid errors here, use the AudioFileDataSource, which extracts the encoding automatically.
Additionally, you should use a data-blocker before the speechClassifier, which ensures that speech frames of a defined size are used to determine speech activity.
-Holger
If you would like to refer to this comment somewhere else in this project, copy and paste the following link:
Hello Everyone.
I am trying to decode a .wav file to text and have finally managed to get a config file that (almost) works. However, with every file I try to decode, I get an error message that reads:
NonSpeechDataFilter: ALERT: getting a SpeechStartSignal while in speech, removing it.
Also, the decoding (whilst not bad in accuracy) seems to only start halfway through my files (they are about 10s long).
Does anyone know whether these two issues could be related or have any advice on how to tweak the NonSpeechDataFilter or related components?
Any help much appreciated
my config is:
<config>
<!-- ******** -->
<!-- frequently tuned properties -->
<!-- ******** -->
<!-- Global values. Components below pick them up through ${name} references. -->

<!-- Beam widths -->
<property name="absoluteBeamWidth" value="-1"/>
<property name="relativeBeamWidth" value="1E-60"/>
<!-- NOTE(review): absoluteWordBeamWidth, relativeWordBeamWidth and
     acousticLookahead are not referenced via ${...} anywhere in the visible
     part of this file; confirm whether they are still needed. -->
<property name="absoluteWordBeamWidth" value="22"/>
<property name="relativeWordBeamWidth" value="1E-30"/>
<property name="acousticLookahead" value="1.7"/>

<!-- Language model and insertion weights -->
<property name="languageWeight" value="10.5"/>
<property name="wordInsertionProbability" value=".2"/>
<property name="silenceInsertionProbability" value=".1"/>

<!-- Names of the front end and recognizer components used by ${frontend} and ${recognizer} -->
<property name="frontend" value="epFrontEnd"/>
<property name="recognizer" value="recognizer"/>

<!-- Diagnostics -->
<property name="showCreations" value="false"/>
<property name="logLevel" value="OFF"/>
<!-- ******** -->
<!-- word recognizer configuration -->
<!-- ******** -->
<!-- Top-level recognizer: delegates to the "decoder" component and attaches the listed monitors. -->
<component name="recognizer"
           type="edu.cmu.sphinx.recognizer.Recognizer">
    <property name="decoder" value="decoder"/>
    <!-- NOTE(review): no component named beamFinder is defined in the visible
         part of this file; confirm it exists or drop the item. -->
    <propertylist name="monitors">
        <item>accuracyTracker </item>
        <item>speedTracker </item>
        <item>memoryTracker </item>
        <item>recognizerMonitor </item>
        <item>beamFinder</item>
    </propertylist>
</component>
<!-- Lex-tree linguist: ties together the acoustic model, language model and dictionary. -->
<component name="lexTreeLinguist"
           type="edu.cmu.sphinx.linguist.lextree.LexTreeLinguist">
    <!-- Referenced components -->
    <property name="logMath" value="logMath"/>
    <property name="acousticModel" value="wsj"/>
    <property name="languageModel" value="trigramModel"/>
    <property name="dictionary" value="dictionary"/>
    <property name="unitManager" value="unitManager"/>

    <!-- Insertion probabilities and weights -->
    <property name="wordInsertionProbability" value="${wordInsertionProbability}"/>
    <property name="silenceInsertionProbability" value="${silenceInsertionProbability}"/>
    <property name="fillerInsertionProbability" value=".02"/>
    <!-- NOTE(review): literal value here, while the global languageWeight is
         10.5; confirm which one is intended. -->
    <property name="languageWeight" value="14"/>

    <!-- Tree construction options -->
    <property name="addFillerWords" value="true"/>
    <property name="generateUnitStates" value="false"/>
    <property name="wantUnigramSmear" value="true"/>
    <property name="unigramSmearWeight" value="1"/>
    <property name="cacheSize" value="0"/>
</component>
<!-- ******** -->
<!-- The Decoder configuration -->
<!-- ******** -->
<!-- The decoder only names the search manager it drives. -->
<component name="decoder"
           type="edu.cmu.sphinx.decoder.Decoder">
    <property name="searchManager" value="wordPruningSearchManager"/>
</component>
<!-- ******** -->
<!-- The Search Manager -->
<!-- ******** -->
<!-- Restored opening tag: the paste had lost the header comment and the
     <component> start tag, leaving the properties outside any component.
     The name matches the searchManager reference in the decoder component.
     NOTE(review): the type is reconstructed from the property set; confirm
     against the original file. -->
<component name="wordPruningSearchManager"
           type="edu.cmu.sphinx.decoder.search.WordPruningBreadthFirstSearchManager">
    <!-- Referenced components -->
    <property name="logMath" value="logMath"/>
    <property name="linguist" value="lexTreeLinguist"/>
    <property name="pruner" value="trivialPruner"/>
    <property name="scorer" value="threadedScorer"/>
    <property name="activeListManager" value="activeListManager"/>

    <!-- Search tuning -->
    <property name="relativeBeamWidth" value="1E-60"/>
    <property name="acousticLookaheadFrames" value="2.0"/>
    <property name="growSkipInterval" value="8"/>
    <property name="checkStateOrder" value="false"/>

    <!-- Lattice / token bookkeeping -->
    <property name="buildWordLattice" value="true"/>
    <property name="maxLatticeEdges" value="50"/>
    <property name="keepAllTokens" value="true"/>
</component>
<!-- Active list factory whose beam widths come from the global properties. -->
<component name="activeList" type="edu.cmu.sphinx.decoder.search.PartitionActiveListFactory">
    <property name="logMath" value="logMath"/>
    <property name="absoluteBeamWidth" value="${absoluteBeamWidth}"/>
    <property name="relativeBeamWidth" value="${relativeBeamWidth}"/>
</component>
<!-- Pruner referenced by the search manager; takes no properties here. -->
<component name="trivialPruner" type="edu.cmu.sphinx.decoder.pruner.SimplePruner"/>
<!-- Acoustic scorer; reads its features from the front end named by ${frontend}. -->
<component name="threadedScorer" type="edu.cmu.sphinx.decoder.scorer.ThreadedAcousticScorer">
    <property name="frontend" value="${frontend}"/>
    <!-- Threading options -->
    <property name="isCpuRelative" value="true"/>
    <property name="numThreads" value="0"/>
    <property name="minScoreablesPerThread" value="10"/>
    <property name="scoreablesKeepFeature" value="true"/>
</component>
<!-- ******* -->
<!-- acoustic model -->
<!-- ******* -->
<!-- WSJ acoustic model; its data is loaded through the wsjLoader component. -->
<component name="wsj"
           type="edu.cmu.sphinx.model.acoustic.WSJ_8gau_13dCep_16k_40mel_130Hz_6800Hz.Model">
    <property name="loader" value="wsjLoader"/>
    <property name="unitManager" value="unitManager"/>
</component>
<!-- ******* -->
<!-- sphinx3Loader -->
<!-- ******* -->
<!-- Loader used by the wsj acoustic model component. -->
<component name="wsjLoader"
           type="edu.cmu.sphinx.model.acoustic.WSJ_8gau_13dCep_16k_40mel_130Hz_6800Hz.ModelLoader">
    <property name="logMath" value="logMath"/>
    <property name="unitManager" value="unitManager"/>
</component>
<!-- ******* -->
<!-- trigramModel -->
<!-- ******* -->
<!-- Trigram language model loaded from a binary DMP file.
     NOTE(review): "location" is an absolute path on one user's desktop;
     it must be adjusted on any other machine. -->
<component name="trigramModel" type="edu.cmu.sphinx.linguist.language.ngram.large.LargeTrigramModel">
    <property name="location" value="C:\Documents and Settings\gbstanlr\Desktop\HUB4_trigram_lm\language_model.arpaformat.DMP"/>
    <property name="dictionary" value="dictionary"/>
    <property name="logMath" value="logMath"/>
    <property name="maxDepth" value="3"/>
    <property name="unigramWeight" value=".5"/>
</component>
<!-- ******** -->
<!-- The Dictionary configuration -->
<!-- ******** -->
<!-- Pronunciation dictionary and filler dictionary, both read from the WSJ
     acoustic model resources. -->
<component name="dictionary"
           type="edu.cmu.sphinx.linguist.dictionary.FastDictionary">
    <property name="dictionaryPath"
              value="resource:/edu.cmu.sphinx.model.acoustic.WSJ_8gau_13dCep_16k_40mel_130Hz_6800Hz.Model!/edu/cmu/sphinx/model/acoustic/WSJ_8gau_13dCep_16k_40mel_130Hz_6800Hz/dict/cmudict.0.6d"/>
    <property name="fillerPath"
              value="resource:/edu.cmu.sphinx.model.acoustic.WSJ_8gau_13dCep_16k_40mel_130Hz_6800Hz.Model!/edu/cmu/sphinx/model/acoustic/WSJ_8gau_13dCep_16k_40mel_130Hz_6800Hz/dict/fillerdict"/>
    <property name="addSilEndingPronunciation" value="false"/>
    <!-- A literal "<" is not allowed inside an XML attribute value, so the
         angle brackets are written as entities; the parsed value is unchanged. -->
    <property name="wordReplacement" value="&lt;sil&gt;"/>
    <property name="allowMissingWords" value="false"/>
    <property name="unitManager" value="unitManager"/>
</component>
<!-- ******** -->
<!-- The unit manager configuration -->
<!-- ******** -->
<!-- Shared by the linguist, acoustic model, loader and dictionary components. -->
<component name="unitManager" type="edu.cmu.sphinx.linguist.acoustic.UnitManager"/>
<!-- ******** -->
<!-- The live frontend configuration -->
<!-- ******** -->
<!-- Endpointing front end. Stages run in the listed order: audio source,
     re-blocking, speech/non-speech endpointing, then feature extraction. -->
<component name="epFrontEnd" type="edu.cmu.sphinx.frontend.FrontEnd">
    <propertylist name="pipeline">
        <item>streamDataSource</item>
        <!-- Added: hands the speech classifier frames of a defined size. -->
        <item>dataBlocker</item>
        <item>speechClassifier</item>
        <item>speechMarker</item>
        <item>nonSpeechDataFilter</item>
        <item>premphasizer</item>
        <item>windower</item>
        <item>fft</item>
        <item>melFilterBank</item>
        <item>dct</item>
        <item>batchCMN</item>
        <item>featureExtraction</item>
    </propertylist>
</component>
<!-- Re-blocks incoming audio into fixed-length chunks before endpointing.
     NOTE(review): 10 ms is assumed to be a sensible block size; tune if needed. -->
<component name="dataBlocker" type="edu.cmu.sphinx.frontend.DataBlocker">
    <property name="blockSizeMs" value="10"/>
</component>
<!-- ******** -->
<!-- The frontend pipelines -->
<!-- ******** -->
<!-- Raw audio source. The declared format must match the audio that is
     actually fed in. -->
<component name="streamDataSource" type="edu.cmu.sphinx.frontend.util.StreamDataSource">
    <!-- Sample format -->
    <property name="sampleRate" value="16000"/>
    <property name="bitsPerSample" value="16"/>
    <property name="signedData" value="true"/>
    <property name="bigEndianData" value="false"/>
    <!-- Read size -->
    <property name="bytesPerRead" value="320"/>
</component>
<!-- Speech classifier stage of the endpointer, with its threshold setting. -->
<component name="speechClassifier" type="edu.cmu.sphinx.frontend.endpoint.SpeechClassifier">
    <property name="threshold" value="16"/>
</component>
<!-- Non-speech filter stage; configured with no properties here. -->
<component name="nonSpeechDataFilter" type="edu.cmu.sphinx.frontend.endpoint.NonSpeechDataFilter"/>
<!-- Speech marker stage of the endpointer, with its speechTrailer setting. -->
<component name="speechMarker" type="edu.cmu.sphinx.frontend.endpoint.SpeechMarker">
    <property name="speechTrailer" value="50"/>
</component>
<!-- Remaining pipeline stages; none of them sets any property here. -->
<component name="premphasizer" type="edu.cmu.sphinx.frontend.filter.Preemphasizer"/>
<component name="windower" type="edu.cmu.sphinx.frontend.window.RaisedCosineWindower"/>
<component name="fft" type="edu.cmu.sphinx.frontend.transform.DiscreteFourierTransform"/>
<component name="melFilterBank" type="edu.cmu.sphinx.frontend.frequencywarp.MelFrequencyFilterBank"/>
<component name="dct" type="edu.cmu.sphinx.frontend.transform.DiscreteCosineTransform"/>
<component name="batchCMN" type="edu.cmu.sphinx.frontend.feature.BatchCMN"/>
<component name="featureExtraction" type="edu.cmu.sphinx.frontend.feature.DeltasFeatureExtractor"/>
<!-- ********** -->
<!-- monitors -->
<!-- ********** -->
<!-- Accuracy monitor attached to the recognizer; both result displays are switched off. -->
<component name="accuracyTracker" type="edu.cmu.sphinx.instrumentation.AccuracyTracker">
    <property name="recognizer" value="${recognizer}"/>
    <property name="showRawResults" value="false"/>
    <property name="showAlignedResults" value="false"/>
</component>
<!-- Memory monitor attached to the recognizer; summary and details are switched off. -->
<component name="memoryTracker" type="edu.cmu.sphinx.instrumentation.MemoryTracker">
    <property name="recognizer" value="${recognizer}"/>
    <property name="showDetails" value="false"/>
    <property name="showSummary" value="false"/>
</component>
<!-- Speed monitor attached to the recognizer and front end; only the summary is shown. -->
<component name="speedTracker" type="edu.cmu.sphinx.instrumentation.SpeedTracker">
    <property name="recognizer" value="${recognizer}"/>
    <property name="frontend" value="${frontend}"/>
    <property name="showDetails" value="false"/>
    <property name="showSummary" value="true"/>
</component>
<!-- ******* -->
<!-- activeListManager -->
<!-- ******* -->
<!-- Restored opening tags: the paste had lost the header comment, the
     <component> start tag and the <propertylist> start tag, leaving the
     items outside any list. The name matches the activeListManager
     reference in the search manager properties.
     NOTE(review): the type and list name are reconstructed; confirm against
     the original file. -->
<component name="activeListManager"
           type="edu.cmu.sphinx.decoder.search.SimpleActiveListManager">
    <propertylist name="activeListFactories">
        <item>unitExitActiveList</item>
        <item>wordActiveList</item>
        <item>wordActiveList</item>
        <item>activeList</item>
        <item>activeList</item>
        <item>activeList</item>
    </propertylist>
</component>
<!-- ******* -->
<!-- unitExitActiveList -->
<!-- ******* -->
<!-- Stray "- " markers from the paste were removed and the comments rejoined
     so that no junk text sits between the elements. -->
<component name="unitExitActiveList" type="edu.cmu.sphinx.decoder.search.PartitionActiveListFactory">
    <property name="absoluteBeamWidth" value="-1"/>
    <property name="logMath" value="logMath"/>
    <property name="relativeBeamWidth" value="${relativeBeamWidth}"/>
</component>
<!-- ******* -->
<!-- wordActiveList -->
<!-- ******* -->
<!-- Stray "- " markers from the paste were removed and the comments rejoined.
     NOTE(review): the beam widths are literals (21 and 1E-25), while the
     globals absoluteWordBeamWidth and relativeWordBeamWidth hold 22 and
     1E-30; confirm which values are intended. -->
<component name="wordActiveList" type="edu.cmu.sphinx.decoder.search.WordActiveListFactory">
    <property name="absoluteBeamWidth" value="21"/>
    <property name="logMath" value="logMath"/>
    <property name="relativeBeamWidth" value="1E-25"/>
</component>
<!-- ******* -->
<!-- recognizerMonitor -->
<!-- ******* -->
<!-- Restored opening tag: the paste had lost the header comment and the
     <component name=...> line, leaving only the type attribute. The name
     matches the recognizerMonitor entry in the recognizer's monitors list. -->
<component name="recognizerMonitor"
           type="edu.cmu.sphinx.instrumentation.RecognizerMonitor">
    <property name="recognizer" value="recognizer"/>
    <propertylist name="allocatedMonitors">
        <item>configMonitor</item>
    </propertylist>
</component>
<!-- create the configMonitor -->
<!-- Listed under allocatedMonitors of the recognizer monitor; showConfig is enabled. -->
<component name="configMonitor" type="edu.cmu.sphinx.instrumentation.ConfigMonitor">
    <property name="showConfig" value="true"/>
    <!--property name="showConfigAdGDL" value="true"/-->
</component>
<!-- ********** -->
<!-- Miscellaneous components -->
<!-- ********** -->
<!-- Log-math helper referenced by the search, model and language model components. -->
<component name="logMath"
           type="edu.cmu.sphinx.util.LogMath">
    <property name="logBase" value="1.0001"/>
    <property name="useAddTable" value="true"/>
</component>
<!-- Confidence scorer; takes the global languageWeight. -->
<component name="confidenceScorer" type="edu.cmu.sphinx.result.MAPConfidenceScorer">
    <property name="languageWeight" value="${languageWeight}"/>
</component>
<!-- ******* -->
<!-- beamFinder (seems like a good idea but causes strange warnings) -->
<!-- ******* -->
</config>
Hi,
Make sure that your audio file is encoded according to your StreamDataSource. If you want to avoid errors here, use the AudioFileDataSource, which extracts the encoding automatically.
Additionally, you should use a data-blocker before the speechClassifier, which ensures that speech frames of a defined size are used to determine speech activity.
-Holger