"""Script for auto tuning keyword spotting thresholds in pocketsphinx"""

from __future__ import print_function

import sys
import select
import os
import termios
import contextlib
import time
import re

import numpy as np

from pocketsphinx.pocketsphinx import *
from sphinxbase.sphinxbase import *

# keyphrases found in kwlist
WORDS = []
# test case containing multiple occurrences
# of words to be used as training audio
TEST_CASE = []
# Threshold values (the exponents X of the /1e-X/ thresholds)
FREQUENCY = []
# End frame of each word in input speech
NO_OF_FRAMES = []
# Recorded speech input
OUTPUT_FILENAME = 'testing_audio.wav'

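# Expected file formats, inferred from the parsing and writing code below
# (not taken from upstream documentation):
#   - dic_path is a pocketsphinx pronunciation dictionary, one entry per line,
#     with the word and its phones separated by a tab, e.g.
#         forward<TAB>F AO R W ER D
#   - kwlist_path holds one keyphrase per line followed by its detection
#     threshold, e.g.
#         move forward /1e-18/
#     This script rewrites the /1e-X/ part as it tunes the thresholds.

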
def preprocess_files(dic_path, kwlist_path):
    """
    Generate the required lists and call the tuning functions
    """
    global WORDS, TEST_CASE, FREQUENCY

    # words found in the dictionary
    _content = []
    with open(dic_path) as _f:
        _content = _f.readlines()
    _content = [x.strip() for x in _content]

    with open(kwlist_path) as _f:
        WORDS = _f.readlines()
    WORDS = [x.strip()[:x.strip().rfind(' ')] for x in WORDS]
    print (WORDS)

    # Loop to find initial thresholds based on the phonetics provided in the dictionary
    for i, _ in enumerate(WORDS):
        # starting position of the first phone for a word
        init_pos = 0
        # Count the number of phones based on the number of spaces
        spaces = 0
        # In case there is more than one word in a keyphrase, add the phones of all words
        for _m in re.finditer(' ', WORDS[i]):
            indices = [j for j, s in enumerate(_content) if WORDS[i][init_pos:_m.start()]+'\t' in s]
            spaces = _content[indices[0]].count(' ') + spaces + 1
            init_pos = _m.start()+1
        indices = [j for j, s in enumerate(_content) if WORDS[i][init_pos:]+'\t' in s]
        spaces += _content[indices[0]].count(' ') + 1
        # Heuristic initial exponent: longer keyphrases (more phones) get a larger
        # exponent, i.e. a more permissive /1e-X/ threshold
        if spaces <= 3:
            FREQUENCY.append(spaces)
        else:
            FREQUENCY.append(spaces * 2)
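    # Worked example of the heuristic above (the dictionary entries are
    # illustrative CMUdict-style pronunciations): for the keyphrase
    # "move forward" with "move<TAB>M UW V" and "forward<TAB>F AO R W ER D",
    # the space counting gives spaces = (2 + 1) + (5 + 1) = 9, so the initial
    # threshold becomes /1e-18/.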
    # Adding random noise in test case for better tuning
    TEST_CASE = ['[RANDOM]', '[RANDOM]']
    TEST_CASE.extend(WORDS)
    TEST_CASE.extend(['[RANDOM]', '[RANDOM]'])
    TEST_CASE.extend(WORDS)
    np.random.shuffle(TEST_CASE)
    print ("HERE IS YOUR TRAINING SET")
    print (TEST_CASE)

    # record audio
    record()
    write_frequency_to_file(kwlist_path)

    # Analysis begins
    actual_tuning(dic_path, kwlist_path, 1)
    print ("Removed many false alarms. New frequency: ")
    print (FREQUENCY)
    print ('Moving on to missed detections')
    actual_tuning(dic_path, kwlist_path, 0)
    print ("Frequency tuned to the best of the script's ability. New frequency: ")
    print (FREQUENCY)
    _missed, _fa = process_threshold(kws_analysis(dic_path, kwlist_path))


def write_frequency_to_file(kwlist_path):
    """
    update modified frequencies in kwlist file
    """
    _f = open(kwlist_path, 'w')
    for i, val in enumerate(FREQUENCY):
        _f.write(WORDS[i] + ' /1e-' + str(val) + '/\n')
    _f.close()


@contextlib.contextmanager
def raw_mode(_file):
    """
    Put the terminal into raw mode so the Enter press after each successful
    utterance can be read immediately, without echo or line buffering
    """
    old_attrs = termios.tcgetattr(_file.fileno())
    new_attrs = old_attrs[:]
    new_attrs[3] = new_attrs[3] & ~(termios.ECHO | termios.ICANON)
    try:
        termios.tcsetattr(_file.fileno(), termios.TCSADRAIN, new_attrs)
        yield
    finally:
        termios.tcsetattr(_file.fileno(), termios.TCSADRAIN, old_attrs)


def record():
    """
    Records user's speech with timestamp for each spoken word
    """
    global NO_OF_FRAMES
    # rec -c 1 -r 16000 -b 16 recording.wav
    print ("-----SAY THE FOLLOWING OUT LOUD AND PRESS ENTER-----")
    print (TEST_CASE[0])
    os.system('rec -q -c 1 -r 16000 -b 16 ' + OUTPUT_FILENAME + ' &')
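    # NO_OF_FRAMES records the cumulative end time of each prompt in frames,
    # converting elapsed wall-clock seconds with a factor of 100 (i.e. assuming
    # the decoder's usual 10 ms frame shift).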
    NO_OF_FRAMES.append(0)
    previous = time.time()
    i = 0
    with raw_mode(sys.stdin):
        while True:
            if sys.stdin in select.select([sys.stdin], [], [], 0)[0]:
                _a = sys.stdin.read(1)
                if _a == '\n':
                    if i == len(TEST_CASE)-1:
                        current = time.time()
                        NO_OF_FRAMES.append(NO_OF_FRAMES[i] + (current - previous)*100)
                        previous = current
                        print ("STOPPING RECORDING")
                        time.sleep(2)
                        # stop recording
                        os.system('pkill rec')
                        print (NO_OF_FRAMES)
                        break
                    else:
                        current = time.time()
                        NO_OF_FRAMES.append(NO_OF_FRAMES[i] + (current - previous)*100)
                        previous = current
                        i = i+1
                        print ("-----SAY THE FOLLOWING OUT LOUD AND PRESS ENTER-----")
                        print (TEST_CASE[i])


def actual_tuning(dic_path, kwlist_path, _z):
    """
    Process false alarms and missed detections to tune the thresholds.
    The parameter _z is 1 for false-alarm analysis and 0 for missed-detection analysis
    """
    # to store thresholds with minimum mismatches
    minimum_inflection = [FREQUENCY[i] for i, _ in enumerate(WORDS)]
    # to check whether a word's assessment has been finished or not
    processed = [0 for i, _ in enumerate(WORDS)]
    # get frequency of missed detections and false alarms
    _missed, _fa = process_threshold(kws_analysis(dic_path, kwlist_path))

    _least_negative_threshold = 1
    _most_negative_threshold = 49

    # Loop until there is at least one word whose assessment has not finished
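    # The loop below assumes the usual pocketsphinx keyword-spotting behaviour:
    # a smaller exponent (a larger /1e-X/ value) makes detection stricter and so
    # reduces false alarms, while a larger exponent makes detection more
    # permissive and reduces missed detections.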
    while 0 in processed:
        if _z == 1:
            # If there is a false alarm, increase the threshold (smaller exponent)
            for i, val in enumerate(_fa):
                if FREQUENCY[i] > _least_negative_threshold and processed[i] == 0:
                    if val[1] > 0:
                        FREQUENCY[i] -= 2
                    else:
                        processed[i] = 1
                else:
                    processed[i] = 1
        else:
            # If there is a missed detection, decrease the threshold (larger exponent)
            for i, val in enumerate(_missed):
                if FREQUENCY[i] < _most_negative_threshold and processed[i] == 0:
                    if val[1] > 0:
                        FREQUENCY[i] += 1
                    else:
                        processed[i] = 1
                else:
                    processed[i] = 1

        write_frequency_to_file(kwlist_path)

        print ('UPDATED FREQUENCY:')
        print (FREQUENCY)

        _previous_missed = []
        _previous_missed.extend(_missed)
        _previous_fa = []
        _previous_fa.extend(_fa)

        _missed, _fa = process_threshold(kws_analysis(dic_path, kwlist_path))

        if _z == 1:
            # If the current readings show an increase in missed detections,
            # go back to the previous state and stop
            for i, val in enumerate(_missed):
                if val[1] > _previous_missed[i][1] and processed[i] == 0:
                    processed[i] = 1
                    FREQUENCY[i] += 2
        else:
            # If the current readings show an increase in false alarms,
            # go back to the previous state and stop
            for i, val in enumerate(_fa):
                if val[1] > _previous_fa[i][1] and processed[i] == 0:
                    processed[i] = 1
                    FREQUENCY[i] -= 1

        # If the updated thresholds gave better accuracy, save them
        for i, val in enumerate([_fa, _missed][_z == 0]):
            if val[1] < [_previous_fa[i][1], _previous_missed[i][1]][_z == 0]:
                minimum_inflection[i] = FREQUENCY[i]

    for i, val in enumerate([_fa, _missed][_z == 0]):
        FREQUENCY[i] = minimum_inflection[i]
    write_frequency_to_file(kwlist_path)


def kws_analysis(dic, kwlist):
    """
    kws analysis on user speech and updated threshold values
    """
    analysis_result = []

    modeldir = "/usr/local/share/pocketsphinx/model/"

    # Create a decoder with a certain model
    config = Decoder.default_config()
    config.set_string('-hmm', os.path.join(modeldir, 'en-us/en-us'))
    config.set_string('-dict', dic)
    config.set_string('-kws', kwlist)
    config.set_string('-dither', "no")
    config.set_string('-logfn', '/dev/null')
    config.set_string('-featparams', os.path.join(os.path.join(modeldir,
                                                               'en-us/en-us'), "feat.params"))

    stream = open(OUTPUT_FILENAME, "rb")

    # Process audio chunk by chunk. On keyphrase detected perform action and restart search
    decoder = Decoder(config)
    decoder.start_utt()
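    # 'timer' counts bytes read from the wav file; with the 16 kHz, 16-bit mono
    # recording produced by record(), one 10 ms frame is 320 bytes, so timer/320
    # below converts a byte offset into an approximate frame index.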
    timer = 0
    while True:
        buf = stream.read(1024)
        if buf:
            decoder.process_raw(buf, False, False)
        else:
            break
        if decoder.hyp() != None:
            for seg in decoder.seg():
                pass
            # keep the last reported segment for this detection
            analysis_result.append([seg.word.rstrip(), timer/320])

            decoder.end_utt()
            decoder.start_utt()
        timer += 1024
    return analysis_result


def process_threshold(analysis_result):
    """
    Calculate missed detections and false alarms
    Argument: analysis_result = kws result
    """
    # stores timestamps of words which match in both the speech and the kws result
    _indices = []

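    # analysis_result is a list of [keyphrase, frame] detections from
    # kws_analysis, e.g. [['move forward', 812]]; the two tallies below are
    # [keyphrase, count] pairs (the example values here are illustrative only).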
    missed = [[WORDS[i], 0] for i in range(len(WORDS))]
    false_alarms = [[WORDS[i], 0] for i in range(len(WORDS))]
    i = 0

    for i, val in enumerate(analysis_result):
        # Calculate the timestamp in speech closest to timestamp of word found by kws result
        _index = min(range(len(NO_OF_FRAMES)), key=lambda l: abs(NO_OF_FRAMES[l] - val[1]))
        _indices.append(_index)

        if TEST_CASE[_index-1] == '[RANDOM]':
            position_observer = WORDS.index(val[0])
            false_alarms[position_observer][1] += 1
            print ('FA Found', val[0], ' in place of RANDOM TEXT')
        elif TEST_CASE[_index-1] == val[0]:
            print ('DETECTED CORRECTLY', val[0])
        else:
            print ('FA Found', val[0], ' in place of ', TEST_CASE[_index-1])
            position_original = WORDS.index(TEST_CASE[_index-1])
            position_observer = WORDS.index(val[0])
            missed[position_original][1] += 1
            false_alarms[position_observer][1] += 1
    # If speech had timestamp not mentioned in kws result, then its detection was missed
    for i, val in enumerate(TEST_CASE):
        if i+1 not in _indices and val != '[RANDOM]':
            position_original = WORDS.index(val)
            missed[position_original][1] += 1
            print ('Missed ', val)
    return missed, false_alarms


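# Typical invocation (the script name here is illustrative):
#     python kws_threshold_tuning.py /path/to/voice_cmd.dic /path/to/automated.kwlist
# With no arguments, the hard-coded demo paths below are used instead.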
if __name__ == '__main__':
    DIC_FILE = "/home/pankaj/catkin_ws/src/pocketsphinx/demo/voice_cmd.dic"
    KWLIST_FILE = "/home/pankaj/catkin_ws/src/pocketsphinx/demo/automated.kwlist"
    if len(sys.argv) == 3:
        DIC_FILE = sys.argv[1]
        KWLIST_FILE = sys.argv[2]
    preprocess_files(DIC_FILE, KWLIST_FILE)