Solution Homework 2 Mar-1

Task 3-1 Write a script coverage.tcl that outputs the number of words in the text file steps/data/transcripts which are not covered by the dictionary steps/data/dict.

For purposes like the calculation of coverage, out-of-vocabulary words, or the determination of a vocabulary, we provide you with the script library coverageLib.tcl, originally written by Martin Westphal. Feel free to use it, extend it, and improve it.
(Source this in Janus %source coverageLib.tcl and use the defined procedures to compare the results to your script).

coverageLib.tcl Download here

# ========================================================================
#  JANUS-SR   Janus Speech Recognition Toolkit
#             ------------------------------------------------------------
#  Author  :  Martin Westphal
#  Module  :  coverage.tcl
#  Date    :  22.Oct.97
#
#  Remarks :  This collection of Tcl procedures can be used to process
#             a text file and create a vocabulary from it or given
#             a vocabulary calculate the coverage of the text.
#             The main procedure is
#                            readCountText 
#             which reads a text file and fills the global array count.
#             There are different procedures to define the vocabulary
#             (see below)
#
#  Global  :  count(<word>)   The count for <word> in the given text.
#
#             vocab(<word>)   Exists and is 1 if <word> is in the
#                             vocabulary.
#             wordL(<c>)      An alphabetic list of words in the text
#                             that have the same count <c>.
# =======================================================================

# =======================================================================
#  Counting words in the text file
# =======================================================================
# -----------------------------------------------------------------------
#  Read a text file and count its words.
#  Each word is fed to countWord; the global count() array is cleared
#  first unless reset is 0.  Returns the number of words read, or -1
#  if the file could not be opened.
# -----------------------------------------------------------------------
proc readCountText {file {reset 1}} {
    if {[catch {open $file r} fp]} {
        puts stderr "Couldn't open file $file for reading"
        return -1
    }
    if {$reset} { resetCount }
    set wordN 0
    while {[gets $fp line] >= 0} {
        ;# each line is interpreted as a Tcl list of words
        ;# NOTE(review): a line with unbalanced braces or quotes is not a
        ;# valid Tcl list and would raise an error here -- confirm the
        ;# input transcripts are clean in that sense.
        foreach w $line {
            countWord $w
            incr wordN
        }
    }
    close $fp
    puts "read $wordN words from file $file"
    return $wordN
}

# -----------------------------------------------------------------------
#  Clear the global word-count table so a new text can be counted.
# -----------------------------------------------------------------------
proc resetCount {} {
    global count
    if {[array exists count]} {
        unset count
    }
}
# -----------------------------------------------------------------------
#  Add one occurrence of the given word to the global count() array.
# -----------------------------------------------------------------------
proc countWord {word} {
    global count
    if {![info exists count($word)]} {
        set count($word) 0
    }
    incr count($word)
}

# -----------------------------------------------------------------------
#  Return the total number of word occurrences counted so far
#  (the sum of all entries in the global count() array).
# -----------------------------------------------------------------------
proc totalWords {} {
    global count
    set sum 0
    foreach {w n} [array get count] {
        incr sum $n
    }
    return $sum
}

# -----------------------------------------------------------------------
#  Return the number of distinct words seen in the text
#  (the number of entries in the global count() array).
# -----------------------------------------------------------------------
proc differentWords {} {
    global count
    return [array size count]
}

# -----------------------------------------------------------------------
#  Return the size of the vocabulary
#  (the number of entries in the global vocab() array).
# -----------------------------------------------------------------------
proc totalVocab {} {
    global vocab
    return [array size vocab]
}

# =======================================================================
#  How to choose the vocabulary:
#  There are different ways to define the vocabulary.
#      1) take the whole dictionary
#      2) take all words with a certain minimal count
#      3) take the most salient words until you reach a certain coverage
#      4) take the n most salient words
#      5) take all (the first n) words of a text
#  First some "sub"-procedures are defined:
#      coverage, sortByCount
# =======================================================================
# -----------------------------------------------------------------------
#  Given a vocabulary, calculate the coverage of the counted text.
#  Returns a two-element list: the number of text-word occurrences
#  covered by the vocabulary, and the number of vocabulary words that
#  actually appear in the text.
# -----------------------------------------------------------------------
proc coveredWords {} {
    global count vocab
    set cover 0
    set known 0
    foreach w [array names vocab] {
        if {![info exists count($w)]} continue
        incr known
        incr cover $count($w)
    }
    return [list $cover $known]
}
proc coverage {} {
    ;# Print and return the coverage statistics of the current text
    ;# (count array) with respect to the current vocabulary (vocab array).
    ;# Returns "diff total known cover".
    global count vocab
    set total [totalWords]      ;# total words in the text
    set diff  [differentWords]  ;# different words in the text
    foreach {cover known} [coveredWords] break

    set miss [expr {$diff - $known}]
    puts "$diff different words, $known known, $miss missing in the vocabulary"

    set pct [expr {100.0 * $cover / $total}]
    set oov [expr {100.0 - $pct}]
    puts "$total total words, $cover ([format %.2f $pct]%) covered, [format %.2f $oov]% OOV"
    return "$diff $total $known $cover"
}
# -----------------------------------------------------------------------
#  Build the global array wordL(): for each count c, an alphabetically
#  sorted list of the words that occur exactly c times.
#  Returns the list of counts, sorted in decreasing order.
# -----------------------------------------------------------------------
proc sortByCount {} {
    global count wordL

    catch {unset wordL}
    foreach {w n} [array get count] {
        lappend wordL($n) $w
    }
    ;# alphabetize every bucket
    foreach n [array names wordL] {
        set wordL($n) [lsort $wordL($n)]
    }
    return [lsort -integer -decreasing [array names wordL]]
}
# -----------------------------------------------------------------------
#  Read the vocabulary from a (dictionary) file, taking only the first
#  word of each line.  Returns the vocabulary size afterwards, or -1
#  if the file could not be opened.
# -----------------------------------------------------------------------
proc vocabRead {file {reset 1}} {
    global vocab

    if {[catch {open $file r} fp]} {
        puts stderr "Couldn't open file $file for reading"
        return -1
    }
    if {$reset} { catch {unset vocab} }
    set newN  0
    set wordN 0
    while {[gets $fp line] >= 0} {
        ;# the dictionary word is the first list element of the line
        set w [lindex $line 0]
        incr wordN
        if {![info exists vocab($w)]} {
            set vocab($w) 1
            incr newN
        }
    }
    close $fp
    puts "read $wordN words ($newN new) from vocab file $file"
    return [totalVocab]
}

# -----------------------------------------------------------------------
#  Add every word whose count is at least minCount to the vocabulary.
#  Returns "vocabN total coverage": vocabulary size, covered word
#  occurrences, and the coverage percentage of the counted text.
# -----------------------------------------------------------------------
proc vocabMinCount {minCount {reset 1}} {
    global wordL vocab

    if {$reset} { catch {unset vocab} }
    set wordN 0
    set newN  0                         ;# words newly added to the vocabulary
    set total [lindex [coveredWords] 0] ;# occurrences already covered
    foreach c [sortByCount] {
        if {$c < $minCount} break
        foreach w $wordL($c) {
            incr wordN
            if {[info exists vocab($w)]} continue
            set vocab($w) 1
            incr newN
            incr total $c
        }
    }
    set pct [format %.2f [expr {100.0 * $total / [totalWords]}]]
    puts stderr "adding $wordN words ($newN new) with min count >= $minCount now cover $total (${pct}%) words in the text"
    return "[totalVocab] $total $pct"
}
    
# -----------------------------------------------------------------------
#  Add the `number` most frequent ("salient") words to the vocabulary.
#  Returns "vocabN total coverage": vocabulary size, covered word
#  occurrences, and the coverage percentage of the counted text.
# -----------------------------------------------------------------------
proc vocabSalient {number {reset 1}} {
    global wordL vocab

    if {$reset} { catch {unset vocab} }
    set wordN 0
    set newN  0                         ;# words newly added to the vocabulary
    set total [lindex [coveredWords] 0] ;# occurrences already covered
    foreach c [sortByCount] {
        if {$wordN >= $number} break
        foreach w $wordL($c) {
            if {$wordN >= $number} break
            incr wordN
            if {[info exists vocab($w)]} continue
            set vocab($w) 1              ;# add word to the vocabulary
            incr total $c
            incr newN
        }
    }
    set pct [format %.2f [expr {100.0 * $total / [totalWords]}]]
    puts stderr "adding $wordN most salient words ($newN new) now cover $total (${pct}%) words in the text"
    return "[totalVocab] $total $pct"
}

# -----------------------------------------------------------------------
#  Add the most salient words until a certain coverage is reached.
#
#  coverage : target coverage of the counted text, in percent (0..100)
#  reset    : if non-zero (default) start from an empty vocabulary
#  Returns "vocabN total coverage" like the other vocab* procedures.
# -----------------------------------------------------------------------
proc vocabCoverage {coverage {reset 1}} {
    global wordL vocab

    if {$reset && [info exists vocab]} { unset vocab }
    set totalW [totalWords]
    ;# number of word occurrences that must be covered to hit the target
    set coverW [expr int(ceil($totalW * $coverage / 100.0))]
    set newN 0                           ;# number of new words in the vocabulary
    set total  [lindex [coveredWords] 0] ;# words covered by the vocabulary
    ;# BUG FIX: the original iterated over a global list countL that is
    ;# never set anywhere in this library (sortByCount *returns* that
    ;# list, nothing assigns countL), so the proc failed with an
    ;# undefined-variable error.  Iterate the decreasing count list from
    ;# sortByCount, exactly as vocabMinCount and vocabSalient do.
    foreach c [sortByCount] {
	if {$total >= $coverW} break
	foreach word $wordL($c) {
	    if {$total >= $coverW} break
	    if ![info exists vocab($word)] {
		set vocab($word) 1       ;# add word to the vocabulary
		incr total $c
		incr newN
	    }
	}
    }
    set coverage [format %.2f [expr 100.0 * $total / [totalWords]]]
    puts stderr "adding $newN most salient words now cover $total (${coverage}%) words in the text"
    set vocabN   [totalVocab]
    return "$vocabN $total $coverage"
}

# -----------------------------------------------------------------------
#  take a maximum of n words from a text file as vocabulary until 
#  you have m words in the vocabulary or a certain coverage
#
#  file     : text file whose words are added to the vocabulary in order
#  n        : stop after reading n words from the file
#  m        : stop once the vocabulary holds m words
#  coverage : stop once this percentage of the counted text is covered
#  reset    : if non-zero (default) clear the vocabulary first
#  Returns "wordN vocabN total coverage", or -1 if the file can't be read.
#  NOTE(review): every word of the file is assumed to be present in the
#  global count() array (filled by readCountText); an unseen word makes
#  the "incr total" line fail -- confirm callers use the same text.
# -----------------------------------------------------------------------
proc vocabText {file n m coverage {reset 1}} {
    global vocab count

    if {$reset && [info exists vocab]} { unset vocab }
    if {[catch {set FP [open $file r]} msg]} {
	puts stderr "Couldn't open file $file for reading"
	return -1
    }
    set totalW [totalWords]
    ;# occurrences needed to reach the requested coverage percentage
    set coverW [expr int(ceil($totalW * $coverage / 100.0))]
    set wordN 0                         ;# words in the text
    set total [lindex [coveredWords] 0] ;# words covered by the vocabulary
    set vocabN [totalVocab]             ;# words in the vocabulary
    set break 0                         ;# flag: leave the outer while loop
    while {[gets $FP line] >= 0} {
	;# each line is treated as a Tcl list of words
	foreach word $line { 
	    if {$wordN >= $n || $vocabN >= $m || $total >= $coverW} {
		set break 1; break
	    }
	    incr wordN
	    if ![info exists vocab($word)] {
		set vocab($word) 1
		incr total $count($word)
		incr vocabN
	    }
	}
	if {$break} break
    }
    close $FP
    set coverage [format %.2f [expr 100.0 * $total / $totalW]]
    puts "read $wordN words from file $file"
    return "$wordN $vocabN $total $coverage"
}

# -----------------------------------------------------------------------
#  print vocabulary size and coverage after n words of text
#  n is taken from a given sorted list
#
#  file  : text file whose words are added to the vocabulary in order
#  nL    : sorted list of word-count checkpoints; at each one a line
#          "wordN vocabN total coverage" is printed
#  reset : if non-zero (default) clear the vocabulary first
#  Returns the final "wordN vocabN total coverage", or -1 on open error.
#  NOTE(review): like vocabText, this assumes every word of the file is
#  present in the global count() array -- confirm before use.
# -----------------------------------------------------------------------
proc coverageText {file nL {reset 1}} {
    global vocab count

    if {$reset && [info exists vocab]} { unset vocab }
    if {[catch {set FP [open $file r]} msg]} {
	puts stderr "Couldn't open file $file for reading"
	return -1
    }
    set totalW [totalWords]
    set wordN  0                         ;# words from the text
    set total  [lindex [coveredWords] 0] ;# words covered by the vocabulary
    set vocabN [totalVocab]              ;# words in the vocabulary
    set i  0                             ;# index of the next checkpoint
    set iN [llength $nL]
    set n [lindex $nL $i]
    set break 0                          ;# flag: leave the outer while loop
    while {[gets $FP line] >= 0} {
	;# each line is treated as a Tcl list of words
	foreach word $line { 
	    incr wordN
	    if ![info exists vocab($word)] {
		set vocab($word) 1
		incr total $count($word)
		incr vocabN
	    }
	    ;# at each checkpoint print the running statistics
	    if {$wordN >= $n} {
		set coverage [format %.2f [expr 100.0 * $total / $totalW]]
		puts "$wordN $vocabN $total $coverage"
		incr i
		if {$i >= $iN} {set break 1; break}
		set n [lindex $nL $i]
	    }
	}
	if {$break} break
    }
    close $FP
    set coverage [format %.2f [expr 100.0 * $total / $totalW]]
    return "$wordN $vocabN $total $coverage"
    
}

Question 3-1: How high is the OOV rate? This question can be answered using the script given above:

janus 
% source coverageLib.tcl
% set myfpin  [open /afs/cs.cmu.edu/project/nnspeech-9/janus/isl-lab/data/transcripts r]
% set myfpout [open textWithoutID w]
% while {[gets $myfpin line] > -1} {
%    set noIDline [lrange $line 1 end]
%    puts $myfpout $noIDline
% }
% close $myfpin
% close $myfpout
% readCountText textWithoutID
% read 1522 words from file textWithoutID
% 1522
% differentWords
% 778
% vocabRead /afs/cs.cmu.edu/project/nnspeech-9/janus/isl-lab/data/dict 
% read 748 words (748 new)
% coverage
% 778 different words, 747 known, 31 missing in the vocabulary
% 1522 total words, 1440 (94.61%) covered, 5.39% OOV
% 778 1522 747 1440

Question 3-2: What are the different ways to define a vocabulary for a speech recognizer?

# =======================================================================
#  How to choose the vocabulary:
#  There are different ways to define the vocabulary.
#      1) take the whole dictionary
#      2) take all words with a certain minimal count
#      3) take the most salient words until you reach a certain coverage
#      4) take the n most salient words
#      5) take all (the first n) words of a text
#  First some "sub"-procedures are defined:
#      coverage, sortByCount
# =======================================================================
(see also in script coverageLib.tcl)

Task 3-2: The dictionary Mydict — The first 10 lines of the dictionary, in the format Janus needs, look like the following:

{ABLE} {E Y P A L}
{ABOUT} {A P A T}
{ACTION} {E K S A N}
{ACTS} {E K T S}
{ADAMS} {E T A N S}
{ADDICTIVE} {A T I K T I V}
{ADDING} {E T I N}
{ADDRESS} {E T R E S}
{ADMINISTRATION} {E T N I N I S T R E Y S A N}
{ADVENTURE} {E T V E N T S A}
...

Task 4: The files db.dat and db.idx — The following script (handed in by Qin) produces the database files.

#===============================================================
#  JANUS-SR   Janus Speech Recognition Toolkit
#
#---------------------------------------------------
#       Advanced Lab Speech Recognition and
#               Understanding
#
#  Author  : Qin Jin 
#  Module  : Mydb.tcl
#  Date    : Feb 23 2001
# 
#  Remarks :
#
#===============================================================
# Require exactly one command-line argument: the transcript file.
# BUG FIX: the -help test used [lindex $argv 1]; with argc == 1 there is
# no element at index 1, so "-help" could never match.  The (only)
# argument lives at index 0.
if { $argc != 1 || [lindex $argv 0] == "-help"} {
     puts stderr "USAGE: $argv0 'inputfile'"
     exit
}
set filename [lindex $argv 0];
# NOTE(review): the surrounding text speaks of db.dat/db.idx, but the
# script creates tdb.dat/tdb.idx -- confirm which names are intended.
[DBase db] open tdb.dat tdb.idx -mode rwc;
set FP [open $filename r];

# one database record per transcript line, keyed by the utterance ID
while { [gets $FP line] != -1 } {
    set key [lindex $line 0];                ;# utterance ID = first field
    set utt "utt [lindex $line 0]";          ;# "utt <id>" field
    set newline "text [lrange $line 1 end]"; ;# "text <words>" field
    db add $key [list $utt $newline];
}
db close;
close $FP;
exit;
Output for db.datis:
{utt 011c021e} {text BANCA COMMERCIALE CALLED THE FED'S REQUIREMENTS UNACCEPTABLE AND DROPPED ITS BID}
{utt 013c020j} {text HE ALSO RULED OUT THE POSSIBILITY THAT BURMAH MIGHT LAUNCH A BID ON ITS OWN}
...
Output for db.idx
       0       2    5125    5166    4961
       0       0     102       8011c021e
      41     103      97       8013c020j
       0     201     138       8014c020r

Question 6-1: Is it possible that WA < 0 or WA > 100. If so, give an example.

Yes, it can happen that WA < 0, if the number of the produced errors is higher than the number of words to be recognized.
Example: The reference is hello world but the hypothesis is tell them all.
That makes 3 errors divided by 2 words to be recognized, which leads to a word error rate of 150% and hence a word accuracy of 100% - 150% = -50%.

No, it can't happen that WA becomes higher than 100%, nobody is better than perfect ;-)

Question 6-2: Is it possible that for a pair of reference/hypothesis the minimal word error rate results from different types of errors (different combination of error types)?

Yes, if each error type counts the same, you can either choose the error sequence "Substitution, Insertion" or "Insertion, Substitution". Even more variants are possible, but the proportion of the error types for the minimal error rate remains the same.

Task 6: The script align.tcl

align.tcl: Nice solution from German students. Download here

#=======================================================================
#  JANUS-SR   Janus Speech Recognition Toolkit
# ----------------------------------------------------------
#  Author  :  Matthias und Roald
#  Module  :  align.tcl
#  Date    :  30.10.97
#
#  Remarks :  Computes word error rate of a hypothesis against a
#             reference utterance, using DTW (Viterbi) algorithm.
#
#======================================================================

# The return value is a list of the following form:
#   {errorRate numErrors alignList errorList}
# errorList contains for every word of the hypothesis the error(s)
# detected between this word and its predecessor:
#   d# : # deletions
#   i  : insertion
#   s  : substitution
#   -  : no error
# Note that d# may be combined with s,-: d#s => # deletions, followed by a
# subst. d#- => # deletions, followed by correct word.
# Note that there may be more than one possible best alignments, but  
# only
# one of these is returned.
# Note that the reference as well as the hypothesis must contain at least
# one word!

proc wordErrorRate {reference hypothesis} {
    ;# Align hypothesis against reference by dynamic programming and
    ;# return {errorRate numErrors alignList errorList} (see the comment
    ;# block above).  Both lists must contain at least one word.
    ;# note that START and END also belong to reference and hypothesis
    set reference [linsert $reference 0 "..START.."]
    lappend reference "..END.."
    set reflen [llength $reference]
    set index 0
    ;# indexing reference words
    ;# refarray() stores the reference in REVERSED order:
    ;# refarray(0) = "..END..", refarray(reflen-1) = "..START.."
    foreach elem $reference {
        set refarray([expr $reflen - 1 - $index]) $elem
        incr index
    }
    ;# Initialize actual vector (diagram column)
    ;# Column for the first hypothesis word: deletions needed to reach
    ;# each reference position, plus 1 if the first word mismatches
    ;# (substitution, eType suffix "s") or 0 on a match (suffix "-").
    set hypword [lindex $hypothesis 0]
    set colnum [llength $hypothesis]
    for {set index 0} {$index < ($reflen - 1)} {incr index} {
        set value [expr $reflen - 2 - $index]
        set actvec($index) $value
        if {$value > 0} {
            set eType "d$value"
        } else {set eType ""}
        set refword $refarray($index)
        if {[string compare $refword $hypword]} {
            incr actvec($index)
            append eType s
        } else {append eType -}
        set backtrack($colnum,$index) [expr $reflen - 1]
        set errorType($colnum,$index) $eType
    }
    ;# bottom cell of the first column: the first hypothesis word is an
    ;# insertion (cost 1, eType "i")
    set index [expr $reflen - 1]
    set actvec($index) 1
    set backtrack($colnum,$index) $index
    set errorType($colnum,$index) i
    incr colnum -1
    ;# main loop: steps through hypothesis list

    foreach hypword [lreplace $hypothesis 0 0] {
        for {set index 0} {$index < $reflen} {incr index} {
            ;# first candidate: insertion (stay at this reference position)
            set minvalue [expr $actvec($index) + 1]
            set minindex $index
            set mineType i
            ;# offset 1 = substitution, 0 = correct word
            if {[string compare $refarray($index) $hypword]} {
                set offset 1
            } else {
                set offset 0
            }
            ;# remaining candidates: come from position j, paying
            ;# (j - index - 1) deletions plus the match/substitution cost
            for {set j [expr $index + 1]} {$j < $reflen} {incr j} {
                set delet [expr $j - $index - 1]
                set value [expr $actvec($j) + $offset + $delet]
                if {$value < $minvalue} {
                    set minvalue $value
                    set minindex $j
                    if {$delet > 0} {set mineType d$delet} else {
                        set mineType ""
                    }
                    if {$offset == 1} {append mineType s} else {
                        append mineType -
                    }
                }
            }
            set actvec($index) $minvalue
            set backtrack($colnum,$index) $minindex  ;# backtrack. link
            set errorType($colnum,$index) $mineType
        }
        incr colnum -1
    }
    ;# last step
    ;# close the alignment: pick the cheapest way to consume the rest of
    ;# the reference (pure deletions).
    ;# NOTE(review): mineType is NOT initialized before this loop.  If
    ;# the initial candidate (minindex 0) wins, mineType keeps a stale
    ;# value from the main loop -- and for a one-word hypothesis it may
    ;# be unset entirely.  Confirm before trusting errorList's last entry.
    set minvalue [expr $actvec(0) + 1]
    set minindex 0
    for {set j 1} {$j < $reflen} {incr j} {
        set value [expr $actvec($j) + $j - 1]
        set delet [expr $j - 1]
        if {$value < $minvalue} {
            set minvalue $value
            set minindex $j
            if {$delet > 0} {set mineType d$delet} else {
                set mineType ""
            }
        }
    }
    ;# backtracking
    ;# walk the backtrack links from the last column to the first,
    ;# collecting the aligned reference words and the error codes
    set alignlist {}
    if {[string length $mineType] > 0} {
        set errorlist [list $mineType]
    } else {set errorlist {}}
    for {set colnum 1} {$colnum <= [llength $hypothesis]} {incr colnum} {
        set alignlist [linsert $alignlist 0 $refarray($minindex)]
        set mineType $errorType($colnum,$minindex)
        set errorlist [linsert $errorlist 0 $mineType]
        set minindex $backtrack($colnum,$minindex)
    }
    ;# error rate = errors / real reference length; (reflen - 2) excludes
    ;# the artificial ..START../..END.. markers
    return [list [expr 1.00*$minvalue/($reflen-2)] $minvalue $alignlist $errorlist]
}

# Test
# Smoke test: align a mangled German hypothesis against its reference
# and print the word error rate, the chosen alignment and the error
# sequence, then terminate the interpreter.

set reference {Guten Morgen es ist viertel vor sechs}
set hypothesis {Puten Morgen ist vierte vor sie sechs}
puts "Reference: $reference\nHypothesis: $hypothesis"
set res [wordErrorRate $reference $hypothesis]
puts "Word error rate = [lindex $res 0] ([lindex $res 1] errors)."
puts "Reached with alignment: [lindex $res 2]"
puts "Corr. error sequence: [lindex $res 3]"
exit

Last modified: Fri Mar 9 10:09:20 EST 2001
Maintainer: tanja@cs.cmu.edu.