proc lmAccu { langmod text } {
upvar $langmod l
incr l(cnt) [llength $text]
set v [lindex $text 0]
for { set x 1 } { $x < [llength $text] } { incr x } {
set w [lindex $text $x] ; set q $v,$w
if {![info exist l(1,$v)]} {set l(1,$v) 1 ; incr l(m)} else {incr l(1,$v)}
if {![info exist l(2,$q)]} {set l(2,$q) 1 ; incr l(b)} else {incr l(2,$q)}
set v $w
}
}
proc lmUpdate { langmod } {
upvar $langmod l ; set disc 0.01
foreach i [array names l] {
regexp {(.*),(.*)} $i dummy n v ; regexp {(.*),(.*),(.*)} $i dummy n v w
if {$n == 1} {set l(p,$i) [expr log(($l($i)-$disc)/$l(cnt)) /2.30259]}
if {$n == 2} {set l(p,$i) [expr log(($l($i)-$disc)/$l(1,$v))/2.30259]}
}
}
proc lmWrite { langmod filename } {
upvar $langmod l
set f [open $filename w] ; set mlist {} ; set blist {}
foreach i [array names l] {
if [regexp {p,1,(.*)} $i d v ] { lappend mlist "$v $l($i) -99.9" }
if [regexp {p,2,(.*),(.*)} $i d v w] { lappend blist "$v $w $l($i)" }
}
set mlist [lsort -ascii $mlist] ; set blist [lsort -ascii $blist]
puts $f "\\data\\\nngram 1=$l(m)\nngram 2=$l(b)\n\n\\1-grams:"
foreach m $mlist { puts $f "[lindex $m 1] [lindex $m 0] [lindex $m 2]" }
puts $f "\n\\2-grams:"
foreach b $blist { puts $f "[lindex $b 2] [lindex $b 0] [lindex $b 1]" }
puts $f "\\end\\"
close $f
}
[DBase db] open ../step1/db.dat ../step1/db.idx -mode r
set lm(cnt) 0 ; set lm(m) 1 ; set lm(b) 0 ; set lm(1,</s>) 1
foreach utt [db] {
makeArray arr [db get $utt]
set text [concat {<s>} $arr(text) {</s>}]
puts "$utt $text"
lmAccu lm $text
}
lmUpdate lm
lmWrite lm langmod
exit