proc lmAccu { langmod text } {
  upvar $langmod l
  incr l(cnt) [llength $text]
  set v [lindex $text 0]
  for { set x 1 } { $x < [llength $text] } { incr x } {
    set w [lindex $text $x] ; set q $v,$w
    if {![info exist l(1,$v)]} {set l(1,$v) 1 ; incr l(m)} else {incr l(1,$v)}
    if {![info exist l(2,$q)]} {set l(2,$q) 1 ; incr l(b)} else {incr l(2,$q)}
    set v $w
  }
}

proc lmUpdate { langmod } {
  upvar $langmod l ; set disc 0.01
  foreach i [array names l] {
    regexp {(.*),(.*)} $i dummy n v ; regexp {(.*),(.*),(.*)} $i dummy n v w
    if {$n == 1} {set l(p,$i) [expr log(($l($i)-$disc)/$l(cnt)) /2.30259]}
    if {$n == 2} {set l(p,$i) [expr log(($l($i)-$disc)/$l(1,$v))/2.30259]}
  }
}

proc lmWrite { langmod filename } {
  upvar $langmod l
  set f [open $filename w] ; set mlist {} ; set blist {}

  foreach i [array names l] {
    if [regexp {p,1,(.*)}      $i d v  ] { lappend mlist "$v    $l($i) -99.9" }
    if [regexp {p,2,(.*),(.*)} $i d v w] { lappend blist "$v $w $l($i)" }
  }
  set mlist [lsort -ascii $mlist] ; set blist [lsort -ascii $blist]
  puts $f "\\data\\\nngram 1=$l(m)\nngram 2=$l(b)\n\n\\1-grams:"
  foreach m $mlist { puts $f "[lindex $m 1] [lindex $m 0] [lindex $m 2]" } 
  puts $f "\n\\2-grams:"
  foreach b $blist { puts $f "[lindex $b 2] [lindex $b 0] [lindex $b 1]" } 
  puts $f "\\end\\"
  close $f 
}

[DBase db] open ../step1/db.dat ../step1/db.idx -mode r

set lm(cnt) 0 ; set lm(m) 1 ; set lm(b) 0 ; set lm(1,</s>) 1

foreach utt [db] {
  makeArray arr [db get $utt]
  set text [concat {<s>} $arr(text) {</s>}]
  puts "$utt $text"
  lmAccu lm $text
}

lmUpdate lm
lmWrite  lm langmod

exit