Task 3-1 Write a script coverage.tcl that outputs the number of words in the text file steps/data/transcripts which are not covered by the dictionary steps/data/dict.
For purposes like calculation of coverage, out-of-vocabulary words or determination of vocabulary we provide you with the script library coverageLib.tcl originally written by Martin Westphal. Feel free to use it, extend and improve it.
(Source this in Janus %source coverageLib.tcl and use the defined procedures to compare the results to your script).
coverageLib.tcl Download here
# ========================================================================
#  JANUS-SR   Janus Speech Recognition Toolkit
#  ------------------------------------------------------------
#  Author  :  Martin Westphal
#  Module  :  coverage.tcl
#  Date    :  22.Oct.97
#
#  Remarks :  This collection of Tcl procedures can be used to process
#             a text file and create a vocabulary from it, or -- given
#             a vocabulary -- calculate the coverage of the text.
#             The main procedure is readCountText, which reads a text
#             file and fills the global array count.  There are
#             different procedures to define the vocabulary (see below).
#
#  Globals :  count(<word>)  occurrence count of <word> in the text
#             vocab(<word>)  exists (with value 1) iff <word> is in the
#                            vocabulary
#             wordL(<n>)     alphabetically sorted list of words in the
#                            text that occur exactly <n> times
# ========================================================================

# =======================================================================
# Counting words in the text file
# =======================================================================

# -----------------------------------------------------------------------
# readCountText: read a text file and count its words.
# Every whitespace-separated element of every line counts as one word.
# With reset != 0 (default) previous counts are discarded first.
# Returns the number of word tokens read, or -1 if the file could not
# be opened.
# -----------------------------------------------------------------------
proc readCountText {file {reset 1}} {
    if {[catch {set FP [open $file r]} msg]} {
        puts stderr "Couldn't open file $file for reading"
        return -1
    }
    if {$reset} { resetCount }
    set wordN 0
    while {[gets $FP line] >= 0} {
        foreach word $line {
            countWord $word
            incr wordN
        }
    }
    close $FP
    puts "read $wordN words from file $file"
    return $wordN
}

# -----------------------------------------------------------------------
# resetCount: forget all word counts.
# -----------------------------------------------------------------------
proc resetCount {} {
    global count
    if {[array exists count]} { unset count }
}

# -----------------------------------------------------------------------
# countWord: increase the counter for the given word.
# -----------------------------------------------------------------------
proc countWord {word} {
    global count
    if {[info exists count($word)]} {
        incr count($word)
    } else {
        set count($word) 1
    }
}

# -----------------------------------------------------------------------
# totalWords: number of word tokens counted so far.
# -----------------------------------------------------------------------
proc totalWords {} {
    global count
    set totalN 0
    foreach word [array names count] {
        incr totalN $count($word)
    }
    return $totalN
}

# -----------------------------------------------------------------------
# differentWords: number of distinct words counted so far.
# -----------------------------------------------------------------------
proc differentWords {} {
    global count
    return [llength [array names count]]
}

# -----------------------------------------------------------------------
# totalVocab: number of words in the vocabulary.
# -----------------------------------------------------------------------
proc totalVocab {} {
    global vocab
    return [llength [array names vocab]]
}

# =======================================================================
# How to choose the vocabulary -- there are different ways:
#   1) take the whole dictionary                           (vocabRead)
#   2) take all words with a certain minimal count         (vocabMinCount)
#   3) take the most salient words until you reach a
#      certain coverage                                    (vocabCoverage)
#   4) take the n most salient words                       (vocabSalient)
#   5) take all (the first n) words of a text              (vocabText)
# First some "sub"-procedures are defined: coverage, sortByCount.
# =======================================================================

# -----------------------------------------------------------------------
# coveredWords: given a vocabulary, return "cover known" where cover is
# the number of text tokens covered by the vocabulary and known is the
# number of vocabulary words that occur in the text.
# -----------------------------------------------------------------------
proc coveredWords {} {
    global count vocab
    set cover 0   ;# text tokens covered by the vocabulary
    set known 0   ;# vocabulary words occurring in the text
    foreach word [array names vocab] {
        if {[info exists count($word)]} {
            incr cover $count($word)
            incr known
        }
    }
    return "$cover $known"
}

# -----------------------------------------------------------------------
# coverage: print coverage statistics for the current count/vocab arrays
# and return "diff total known cover".
# -----------------------------------------------------------------------
proc coverage {} {
    global count vocab
    set total [totalWords]       ;# total word tokens in the text
    set diff  [differentWords]   ;# distinct words in the text
    set coverKnown [coveredWords]
    set cover [lindex $coverKnown 0]
    set known [lindex $coverKnown 1]
    set miss  [expr {$diff - $known}]
    puts "$diff different words, $known known, $miss missing in the vocabulary"
    set coverage [expr {100.0 * $cover / $total}]
    set oov [expr {100.0 - $coverage}]
    puts "$total total words, $cover ([format %.2f $coverage]%) covered, [format %.2f $oov]% OOV"
    return "$diff $total $known $cover"
}

# -----------------------------------------------------------------------
# sortByCount: (re)build the global array wordL(<n>) mapping each count n
# to the alphabetically sorted list of words that occur n times.
# Returns the list of occurring counts in decreasing order.
# -----------------------------------------------------------------------
proc sortByCount {} {
    global count wordL
    if {[info exists wordL]} { unset wordL }
    foreach word [array names count] {
        lappend wordL($count($word)) $word
    }
    # --- sort each per-count word list alphabetically ---
    foreach c [array names wordL] {
        set wordL($c) [lsort $wordL($c)]
    }
    return [lsort -integer -decreasing [array names wordL]]
}

# -----------------------------------------------------------------------
# vocabRead: read the vocabulary from a (dictionary) file, taking only
# the first word of each line.  Returns the vocabulary size or -1.
# -----------------------------------------------------------------------
proc vocabRead {file {reset 1}} {
    global vocab
    if {[catch {set FP [open $file r]} msg]} {
        puts stderr "Couldn't open file $file for reading"
        return -1
    }
    if {$reset && [info exists vocab]} { unset vocab }
    set newN 0
    set wordN 0
    while {[gets $FP line] >= 0} {
        set word [lindex $line 0]
        incr wordN
        if {![info exists vocab($word)]} {
            set vocab($word) 1
            incr newN
        }
    }
    close $FP
    puts "read $wordN words ($newN new) from vocab file $file"
    return [totalVocab]
}

# -----------------------------------------------------------------------
# vocabMinCount: add all words with count >= minCount to the vocabulary.
# Returns "vocabN total coverage".
# -----------------------------------------------------------------------
proc vocabMinCount {minCount {reset 1}} {
    global wordL vocab
    if {$reset && [info exists vocab]} { unset vocab }
    set wordN 0
    set newN  0                          ;# new words in the vocabulary
    set total [lindex [coveredWords] 0]  ;# tokens already covered
    foreach c [sortByCount] {
        if {$c < $minCount} break
        foreach word $wordL($c) {
            incr wordN
            if {![info exists vocab($word)]} {
                set vocab($word) 1
                incr newN
                incr total $c
            }
        }
    }
    set coverage [format %.2f [expr {100.0 * $total / [totalWords]}]]
    puts stderr "adding $wordN words ($newN new) with min count >= $minCount now cover $total (${coverage}%) words in the text"
    set vocabN [totalVocab]
    return "$vocabN $total $coverage"
}

# -----------------------------------------------------------------------
# vocabSalient: add the n most salient (most frequent) words.
# Returns "vocabN total coverage".
# -----------------------------------------------------------------------
proc vocabSalient {number {reset 1}} {
    global wordL vocab
    if {$reset && [info exists vocab]} { unset vocab }
    set wordN 0
    set newN  0                          ;# new words in the vocabulary
    set total [lindex [coveredWords] 0]  ;# tokens already covered
    foreach c [sortByCount] {
        if {$wordN >= $number} break
        foreach word $wordL($c) {
            if {$wordN >= $number} break
            incr wordN
            if {![info exists vocab($word)]} {
                set vocab($word) 1       ;# add word to the vocabulary
                incr total $c
                incr newN
            }
        }
    }
    set coverage [format %.2f [expr {100.0 * $total / [totalWords]}]]
    puts stderr "adding $wordN most salient words ($newN new) now cover $total (${coverage}%) words in the text"
    set vocabN [totalVocab]
    return "$vocabN $total $coverage"
}

# -----------------------------------------------------------------------
# vocabCoverage: add the most salient words until the given coverage
# (in percent) of the text is reached.  Returns "vocabN total coverage".
# -----------------------------------------------------------------------
proc vocabCoverage {coverage {reset 1}} {
    global wordL vocab
    if {$reset && [info exists vocab]} { unset vocab }
    set totalW [totalWords]
    set coverW [expr {int(ceil($totalW * $coverage / 100.0))}] ;# tokens needed
    set newN  0                          ;# new words in the vocabulary
    set total [lindex [coveredWords] 0]  ;# tokens already covered
    # BUG FIX: the original iterated over the undefined global list
    # $countL (never set anywhere in this library); sortByCount returns
    # exactly the decreasing list of counts -- and fills wordL -- that
    # this loop needs.
    foreach c [sortByCount] {
        if {$total >= $coverW} break
        foreach word $wordL($c) {
            if {$total >= $coverW} break
            if {![info exists vocab($word)]} {
                set vocab($word) 1       ;# add word to the vocabulary
                incr total $c
                incr newN
            }
        }
    }
    set coverage [format %.2f [expr {100.0 * $total / [totalWords]}]]
    puts stderr "adding $newN most salient words now cover $total (${coverage}%) words in the text"
    set vocabN [totalVocab]
    return "$vocabN $total $coverage"
}

# -----------------------------------------------------------------------
# vocabText: take a maximum of n words from a text file as vocabulary,
# until the vocabulary holds m words or the given coverage is reached.
# Returns "wordN vocabN total coverage" or -1 if the file cannot be read.
# -----------------------------------------------------------------------
proc vocabText {file n m coverage {reset 1}} {
    global vocab count
    if {$reset && [info exists vocab]} { unset vocab }
    if {[catch {set FP [open $file r]} msg]} {
        puts stderr "Couldn't open file $file for reading"
        return -1
    }
    set totalW [totalWords]
    set coverW [expr {int(ceil($totalW * $coverage / 100.0))}]
    set wordN  0                          ;# words read from the text
    set total  [lindex [coveredWords] 0]  ;# tokens covered by the vocabulary
    set vocabN [totalVocab]               ;# words in the vocabulary
    set done 0                            ;# renamed from 'break' for clarity
    while {[gets $FP line] >= 0} {
        foreach word $line {
            if {$wordN >= $n || $vocabN >= $m || $total >= $coverW} {
                set done 1
                break
            }
            incr wordN
            if {![info exists vocab($word)]} {
                set vocab($word) 1
                # ROBUSTNESS: a word of this file may not occur in the
                # counted text at all; it then contributes no coverage
                # (the original would raise "no such element" here).
                if {[info exists count($word)]} { incr total $count($word) }
                incr vocabN
            }
        }
        if {$done} break
    }
    close $FP
    set coverage [format %.2f [expr {100.0 * $total / $totalW}]]
    puts "read $wordN words from file $file"
    return "$wordN $vocabN $total $coverage"
}

# -----------------------------------------------------------------------
# coverageText: print vocabulary size and coverage after n words of
# text, for each n taken from the given sorted list nL.
# Returns "wordN vocabN total coverage" or -1 if the file cannot be read.
# -----------------------------------------------------------------------
proc coverageText {file nL {reset 1}} {
    global vocab count
    if {$reset && [info exists vocab]} { unset vocab }
    if {[catch {set FP [open $file r]} msg]} {
        puts stderr "Couldn't open file $file for reading"
        return -1
    }
    set totalW [totalWords]
    set wordN  0                          ;# words read from the text
    set total  [lindex [coveredWords] 0]  ;# tokens covered by the vocabulary
    set vocabN [totalVocab]               ;# words in the vocabulary
    set i  0
    set iN [llength $nL]
    set n  [lindex $nL $i]
    set done 0                            ;# renamed from 'break' for clarity
    while {[gets $FP line] >= 0} {
        foreach word $line {
            incr wordN
            if {![info exists vocab($word)]} {
                set vocab($word) 1
                # ROBUSTNESS: guard against words absent from count
                # (same issue as in vocabText above).
                if {[info exists count($word)]} { incr total $count($word) }
                incr vocabN
            }
            if {$wordN >= $n} {
                set coverage [format %.2f [expr {100.0 * $total / $totalW}]]
                puts "$wordN $vocabN $total $coverage"
                incr i
                if {$i >= $iN} { set done 1; break }
                set n [lindex $nL $i]
            }
        }
        if {$done} break
    }
    close $FP
    set coverage [format %.2f [expr {100.0 * $total / $totalW}]]
    return "$wordN $vocabN $total $coverage"
}
Question 3-1: How high is the OOV-rate? This question can be answered using the script given above:
janus % source coverageLib.tcl % set myfpin [open /afs/cs.cmu.edu/project/nnspeech-9/janus/isl-lab/data/transcripts r] % set myfpout [open textWithoutID w] % while {[gets $myfpin line] > -1} { % set noIDline [lrange $line 1 end] % puts $myfpout $noIDline % } % close $myfpin % close $myfpout % readCountText textWithoutID % read 1522 words from file textWithoutID % 1522 % differentWords % 778 % vocabRead /afs/cs.cmu.edu/project/nnspeech-9/janus/isl-lab/data/dict % read 748 words (748 new) % coverage % 778 different words, 747 known, 31 missing in the vocabulary % 1522 total words, 1440 (94.61%) covered, 5.39% OOV % 778 1522 747 1440
Question 3-2: What are the different ways to define a vocabulary for a speech recognizer?
# ======================================================================= # How to choose the vocabulary: # There are different ways to define the vocabulary. # 1) take the whole dictionary # 2) take all words with a certain minimal count # 3) take the most salient words until you reach a certain coverage # 4) take the n most salient words # 5) take all (the first n) words of a text # First some "sub"-procedures are defined: # coverage, sortByCount # =======================================================================(see also in script coverageLib.tcl)
Task 3-2: The dictionary Mydict. The first 10 lines of the dictionary, in the format Janus needs it, look like the following:
{ABLE} {E Y P A L} {ABOUT} {A P A T} {ACTION} {E K S A N} {ACTS} {E K T S} {ADAMS} {E T A N S} {ADDICTIVE} {A T I K T I V} {ADDING} {E T I N} {ADDRESS} {E T R E S} {ADMINISTRATION} {E T N I N I S T R E Y S A N} {ADVENTURE} {E T V E N T S A} ...
Task 4: The files db.dat and db.idx The following script (handed-in by Qin) gives you the database files.
#===============================================================
# JANUS-SR    Janus Speech Recognition Toolkit
#
#---------------------------------------------------
#              Advanced Lab Speech Recognition and
#              Understanding
#
#  Author  :  Qin Jin
#  Module  :  Mydb.tcl
#  Date    :  Feb 23 2001
#
#  Remarks :  Reads a transcript file (utterance ID followed by the
#             text) and stores each utterance in a Janus DBase under
#             its ID, as {utt <id>} {text <words>}.
#===============================================================

# Print usage when the argument count is wrong or help is requested.
# BUG FIX: the help flag is the first (index 0) argument; the original
# tested [lindex $argv 1], which with argc == 1 is always empty and so
# could never match "-help".
if { $argc != 1 || [lindex $argv 0] == "-help"} {
    puts stderr "USAGE: $argv0 'inputfile'"
    exit
}
set filename [lindex $argv 0];

# Open (or create, -mode rwc) the database files.
# NOTE(review): the files are named tdb.dat/tdb.idx here although the
# task text speaks of db.dat/db.idx -- confirm the intended names.
[DBase db] open tdb.dat tdb.idx -mode rwc;

set FP [open $filename r];
while { [gets $FP line] != -1 } {
    set key [lindex $line 0];                    ;# utterance ID = first field
    set utt "utt $key";                          ;# same value as $key
    set newline "text [lrange $line 1 end]";     ;# remainder is the text
    db add $key [list $utt $newline];
}
db close;
close $FP;
exit;

Output for db.dat is:
{utt 011c021e} {text BANCA COMMERCIALE CALLED THE FED'S REQUIREMENTS UNACCEPTABLE AND DROPPED ITS BID} {utt 013c020j} {text HE ALSO RULED OUT THE POSSIBILITY THAT BURMAH MIGHT LAUNCH A BID ON ITS OWN} ...Output for db.idx
0 2 5125 5166 4961 0 0 102 8011c021e 41 103 97 8013c020j 0 201 138 8014c020r
Question 6-1: Is it possible that WA < 0 or WA > 100. If so, give an example.
Yes, it can happen that WA < 0, if the number of the produced errors is higher than the number of words to be recognized.
Example: The reference is hello world but the hypothesis is tell them all.
That makes 3 errors divided by a total of 2 words to be recognized, which leads to a word error rate of 150%, and thus a word accuracy of 100% - 150% = -50%.
No, it can't happen that WA becomes higher than 100%, nobody is better than perfect ;-)
Question 6-2: Is it possible that for a pair of reference/hypothesis the minimal word error rate results from different types of errors (different combination of error types)?
Yes, if each error type counts the same, you can either choose the error sequence "Substitution, Insertion" or "Insertion, Substitution". Even more variants are possible, but the proportion of the error types for the minimal error rate remains the same.
Task 6: The script align.tcl
align.tcl: Nice solution from German students. Download here
#=======================================================================
#  JANUS-SR   Janus Speech Recognition Toolkit
#  ----------------------------------------------------------
#  Author  :  Matthias und Roald
#  Module  :  align.tcl
#  Date    :  30.10.97
#
#  Remarks :  Computes word error rate of a hypothesis against a
#             reference utterance, using DTW (Viterbi) algorithm.
#
#======================================================================
# The return value is a list of the following form:
#     {errorRate numErrors alignList errorList}
# errorList contains for every word of the hypothesis the error(s)
# detected between this word and its predecessor:
#     d#  :  # deletions
#     i   :  insertion
#     s   :  substitution
#     -   :  no error
# Note that d# may be combined with s,-:
#     d#s => # deletions, followed by a substitution
#     d#- => # deletions, followed by a correct word.
# Note that there may be more than one possible best alignment, but
# only one of these is returned.
# Note that the reference as well as the hypothesis must contain at
# least one word!
proc wordErrorRate {reference hypothesis} {
    # Surround the reference with sentence boundary markers; START and
    # END then also belong to reference and hypothesis.
    set reference [linsert $reference 0 "..START.."]
    lappend reference "..END.."
    set reflen [llength $reference]
    # Store the reference REVERSED into refarray:
    # refarray(0) is the last reference word ("..END..") and
    # refarray(reflen-1) is "..START..".
    set index 0 ;# indexing reference words
    foreach elem $reference {
        set refarray([expr $reflen - 1 - $index]) $elem
        incr index
    }
    # Initialize actual vector (one column of the DP trellis) from the
    # first hypothesis word.  Columns are numbered downward from
    # [llength $hypothesis] to 1 (one per hypothesis word).
    set hypword [lindex $hypothesis 0]
    set colnum [llength $hypothesis]
    for {set index 0} {$index < ($reflen - 1)} {incr index} {
        # Cost of skipping (deleting) all reference words above this one.
        set value [expr $reflen - 2 - $index]
        set actvec($index) $value
        if {$value > 0} { set eType "d$value" } else {set eType ""}
        set refword $refarray($index)
        if {[string compare $refword $hypword]} {
            # Words differ: add a substitution on top of the deletions.
            incr actvec($index)
            append eType s
        } else {append eType -}
        # First column always backtracks to the "..START.." cell.
        set backtrack($colnum,$index) [expr $reflen - 1]
        set errorType($colnum,$index) $eType
    }
    # Cell at "..START..": the first hypothesis word as pure insertion.
    set index [expr $reflen - 1]
    set actvec($index) 1
    set backtrack($colnum,$index) $index
    set errorType($colnum,$index) i
    incr colnum -1
    # Main loop: one trellis column per remaining hypothesis word.
    foreach hypword [lreplace $hypothesis 0 0] {
        for {set index 0} {$index < $reflen} {incr index} {
            # Default transition: insertion (stay on same reference word).
            set minvalue [expr $actvec($index) + 1]
            set minindex $index
            set mineType i
            # offset is the substitution cost for this cell (0 = match).
            if {[string compare $refarray($index) $hypword]} {
                set offset 1
            } else {
                set offset 0
            }
            # Try coming from every earlier cell j; the gap j-index-1 is
            # the number of deleted reference words.
            for {set j [expr $index + 1]} {$j < $reflen} {incr j} {
                set delet [expr $j - $index - 1]
                set value [expr $actvec($j) + $offset + $delet]
                if {$value < $minvalue} {
                    set minvalue $value
                    set minindex $j
                    if {$delet > 0} {set mineType d$delet} else {
                        set mineType ""
                    }
                    if {$offset == 1} {append mineType s} else {
                        append mineType -
                    }
                }
            }
            set actvec($index) $minvalue
            set backtrack($colnum,$index) $minindex ;# backtrack. link
            set errorType($colnum,$index) $mineType
        }
        incr colnum -1
    }
    # Last step: close the alignment at "..END.." (refarray index 0).
    # NOTE(review): if no j below improves on actvec(0)+1, mineType
    # keeps its stale value from the main loop instead of being set for
    # this default case -- looks like a possible bug; confirm.
    set minvalue [expr $actvec(0) + 1]
    set minindex 0
    for {set j 1} {$j < $reflen} {incr j} {
        set value [expr $actvec($j) + $j - 1]
        set delet [expr $j - 1]
        if {$value < $minvalue} {
            set minvalue $value
            set minindex $j
            if {$delet > 0} {set mineType d$delet} else {
                set mineType ""
            }
        }
    }
    # Backtracking: walk the stored links from column 1 (last hypothesis
    # word processed first in the lists below) back to the first column,
    # collecting the aligned reference words and the error codes.
    set alignlist {}
    if {[string length $mineType] > 0} {
        set errorlist [list $mineType]
    } else {set errorlist {}}
    for {set colnum 1} {$colnum <= [llength $hypothesis]} {incr colnum} {
        set alignlist [linsert $alignlist 0 $refarray($minindex)]
        set mineType $errorType($colnum,$minindex)
        set errorlist [linsert $errorlist 0 $mineType]
        set minindex $backtrack($colnum,$minindex)
    }
    # Error rate = edit distance / number of real reference words
    # (reflen-2 excludes the START and END markers).
    return [list [expr 1.00*$minvalue/($reflen-2)] $minvalue $alignlist $errorlist]
}

# --- self test -------------------------------------------------------
set reference {Guten Morgen es ist viertel vor sechs}
set hypothesis {Puten Morgen ist vierte vor sie sechs}
puts "Reference: $reference\nHypothesis: $hypothesis"
set res [wordErrorRate $reference $hypothesis]
puts "Word error rate = [lindex $res 0] ([lindex $res 1] errors)."
puts "Reached with alignment: [lindex $res 2]"
puts "Corr. error sequence: [lindex $res 3]"
exit
Last modified: Fri Mar 9 10:09:20 EST 2001
Maintainer: tanja@cs.cmu.edu.