This first solution of the string edit distance problem
follows directly from the mathematical definition.
It can be seen that it involves ternary recursion and is therefore
exponentially slow in terms of the length of the
input strings.
let rec
length = lambda L. if null L then 0 else 1 + length tl L,
min = lambda x. lambda y. if x < y then x else y,
A =
'A'::'C'::'G'::'T'::nil
,B=
'A'::'G'::'C'::'T'::nil
in let rec
Distance = lambda A. lambda B.
if null A then length B
else if null B then length A
else
let As = tl A, Bs = tl B
in if hd A = hd B then Distance As Bs
else 1 + min (Distance As Bs)
(min (Distance As B)
(Distance A Bs))
in Distance A B
{\fB Edit Distance, \fP}
{\fB best case (A=B) O(|A|), worst case exponential. \fP}
The next version avoids doing repeated work
by storing results in an "array"
(actually list of lists) - the well-known
dynamic programming algorithm (DPA).
This reduces the time complexity to O(|A|*|B|)
where the two strings are A and B.
let rec
count = lambda L. lambda B.
if null B then nil
else (1 + hd L) :: count tl L tl B,
last = lambda L. if null tl L then hd L else last tl L,
min = lambda x. lambda y. if x < y then x else y,
A = 'a'::'c'::'g'::'t'::'a'::'c'::
'g'::'t'::'a'::'c'::'g'::'t'::nil {e.g.}
,B = 'a'::'g'::'c'::'t'::'a'::'c'::
't'::'a'::'c'::'t'::'g'::'t'::nil {e.g.}
in let
Distance = lambda A. lambda B.
let rec
Rows = (0 :: count hd Rows B) {the first row }
:: EachRow A hd Rows {the other rows},
EachRow = lambda A. lambda lastrow.
if null A then nil
else
let rec
Ach = hd A,
DoRow = lambda B. lambda NW. lambda W. {NW N}
if null B then nil {W .}
else
let N = tl NW
in let me = if Ach = hd B then hd NW
else 1 + min W (min hd N hd NW)
in me :: DoRow tl B tl NW me,
thisrow = (1 + hd lastrow)
:: DoRow B lastrow hd thisrow
in thisrow :: EachRow tl A thisrow
in last (last Rows)
in Distance A B
{\fB Edit Distance, O(|A|*|B|) time and space. \fP}
The final edit distance program
reduces the time complexity of O(n*D(A,B)) where the strings
are of length ~n, and D(A,B) is the edit distance of A and B.
This program is fast if the strings are similar
in which case the edit distance is small.
It relies on lazy evaluation or `call by need' to get this speed up.
For a full explanation, see:
L. Allison. Lazy dynamic programming can be eager.
Information Processing Letters 43 p207-212,
Sept. 1992
[HTML]
let rec
min = lambda x. lambda y. if x < y then x else y,
length = lambda L. if null L then 0 else 1+length tl L,
last = lambda L. if null tl L then hd L else last tl L,
index = lambda n. lambda L.
if n=1 then hd L else index (n-1) tl L,
acgt = lambda n.
if n > 0 then 'a'::'c'::'g'::'t'::(acgt (n-4)) else nil,
mutate = lambda L. lambda mutn.
let rec
n = length L,
step = if mutn=0 then 2*n+1 else n/mutn,
ch = lambda L. lambda st. lambda mtype.
if null L then nil
else if st = 0 then
if mtype=1 or mtype=3 then {2:1:1}
'x'::(ch tl L step (mtype+1)) {change}
else if mtype=2 then (ch tl L step 3) {delete}
else 'y'::(ch L step 1) {insert}
else (hd L)::(ch tl L (st-1) mtype) {copy}
in ch L (step/2) 1,
A = acgt 100 {e.g.}
,B = mutate A 4 {e.g.}
in let
Distance = lambda A. lambda B.
let rec
MainDiag = OneDiag A B hd Uppers (-1 :: hd Lowers),
Uppers = EachDiag A B (MainDiag::Uppers), {upper diags}
Lowers = EachDiag B A (MainDiag::Lowers), {lower diags}
OneDiag = lambda A. lambda B.
lambda diagAbove. lambda diagBelow.
let rec
DoDiag= lambda A. lambda B. lambda NW. lambda N. lambda W.
if null A or null B then nil
else { NW N }
let me = if hd A = hd B then NW { W me }
{fast} else 1+if hd W < NW then hd W else min hd N NW
{slow} { else 1+min NW (min hd N hd W) }
in me::DoDiag tl A tl B me tl N tl W, {along diag}
{hope these ^^^^ ^^^^not evaluated}
thisdiag = (1+hd diagBelow)
:: DoDiag A B hd thisdiag diagAbove tl diagBelow
in thisdiag,
EachDiag = lambda A. lambda B. lambda Diags.
if null B then nil
else (OneDiag A tl B hd tl tl Diags hd Diags) {one diag &}
:: EachDiag A tl B tl Diags {the others}
in let LAB = (length A) - (length B)
in last if LAB=0 then MainDiag
else if LAB > 0 then index LAB Lowers
else {LAB < 0} index (-LAB) Uppers
in Distance A B
{\fB Edit-Distance, diagonal orientation. \fP
For the record, the last algorithm in Lazy ML
See:
L. Allison.
Lazy dynamic programming can be eager.
Information Processing Letters 43 p207-212, Sept. 1992
[HTML].
(The algorithm is also available in
[Haskell-98].)
let
takeDNA ('>'.title) =
let rec
skipline 1 ('\n'.dna) = getDNA dna ||
skipline N ('\n'.dna) = skipline (N-1) dna ||
skipline N (a.b) = skipline N b
and
getDNA (Ch.dna)&(mem Ch ['\n'; ' ']) = getDNA dna ||
getDNA (Base.dna)&
(mem Base ['a';'c';'g';'t';'A';'C';'G';'T'])
= let Bases,rest = getDNA dna
in (Base.Bases),rest ||
getDNA x = [],x
in skipline 2 title
and
D A B =
let rec
MainDiag = OneDiag A B (hd Uppers) ( -1 . (hd Lowers))
and
Uppers = EachDiag A B (MainDiag.Uppers)
and
Lowers = EachDiag B A (MainDiag.Lowers)
and
OneDiag A B diagAbove diagBelow =
let rec
DoDiag [] B NW N W = [] ||
DoDiag A [] NW N W = [] ||
DoDiag (A.As) (B.Bs) NW N W =
let me = if A=B then NW
else 1+min3 (hd W) NW (hd N)
in me.(DoDiag As Bs me (tl N) (tl W))
and
firstelt = 1+(hd diagBelow)
and
thisdiag =
firstelt.(DoDiag A B firstelt diagAbove (tl diagBelow))
in thisdiag
and
min3 X Y Z =
-- min X (min Y Z) -- makes it O(|A|*|B|)
if X < Y then X else min Y Z -- makes it O(|A|*D(A,B))
and
EachDiag A [] Diags = [] ||
EachDiag A (B.Bs) (LastDiag.Diags) =
let NextDiag = hd(tl Diags)
in (OneDiag A Bs NextDiag LastDiag).(EachDiag A Bs Diags)
and
LAB = (length A)-(length B)
in last( if LAB = 0 then MainDiag
else if LAB > 0 then select LAB Lowers
else select (-LAB) Uppers )
in let rec
L = choplist takeDNA input
and A = hd L
and B = hd(tl L)
in "D A[" @ (itos(length A))
@ "] B[" @ (itos(length B)) @ "] = "
@ (itos(
D A B
))
@ "\n"
-- O(|A|*D(A,B)) Edit Distance.