{-----------------------------------------------------------------------------} { This is a crude prototype, no warranty, use at own risk !!! } { note remarks about storage management in the published paper. } { L. Allison, Dept. Computer Science, Monash University } { August 1992 } { If you use this code or the ideas in it cite: } { L. Allison. } { A Fast Algorithm for the Optimal Alignment of Three Strings. } { J. Theor. Biol. 164(2) pp261-269 1993 } { http://dx.doi.org/10.1006/jtbi.1993.1153 } { Covered by the GNU General Public License Version 2, June 1991 } { see http://www.gnu.org/copyleft/gpl.html } { and http://www.gnu.org/ } { Procedure finds an optimal alignment of three strings under } { tree-costs where each insertion, deletion or change costs one. } { Later, also see: } { D. R. Powell, L. Allison and T. I. Dix. } { Fast, Optimal Alignment of Three Sequences Using Linear Gap Cost. } { J. Theor. Biol. 207(3) pp325-336 Dec 2000 } { http://dx.doi.org/10.1006/jtbi.2000.2177 } procedure UkkonenR( var As:string; Alen:integer; var Bs:string; Blen:integer; var Cs:string; Clen:integer; var cost :integer); {NB. type string = array[1..MaxLen] of char } const MaxCost = 70; infinity = 32000; {!-)} type DiagRange = -MaxCost .. MaxCost; DiagName = record ab, ac :DiagRange end; var U :array[ DiagRange, DiagRange, 0..MaxCost] of integer; {-memo array} { U[ab,ac,cost] = max a s.t. D(a,b,c) <= cost where ab=a-b & ac=a-c } { = -infinity if no such a exists } { so U[ab,ac,cost] >= U[ab,ac,cost-1] } { & U[ab,ac,cost] >= U[ab,ac,cost-2]+1 } Udone :array [DiagRange, DiagRange] of DiagRange; { Udone[ab,ac] = max cost for which U[ab,ac,cost] has yet been computed.} TheDiag :DiagName; ab, ac :DiagRange; a, i :integer; ncode :array [1..7] of integer; {neighbour codes} {statistics} P1Counter,P2Counter,P3Counter, VolCounter, DiagCounter, RevCounter :integer; procedure ShowU; {print U[,,] matrix for diagnostic purposes or whatever} label 77, 88; const limit = 9; var c, ab, ac :integer; begin for c := 0 to cost do begin for ab := max(-limit, -c) to min(limit, c) do begin for ac := -limit to limit do if c <= Udone[ab,ac] then goto 77; {print row} goto 88 {row blank}; 77: write(ab :3, ':', ' ' :1+limit-ab); for ac := -limit to limit do if c <= Udone[ab, ac] then {calculated} if U[ab, ac, c] >= -99 then write(U[ab, ac, c] :3) else write('-oo') else write(' ..'); writeln; 88: {} end; writeln end end {ShowU}; function triple( {string indices} a, b, c :integer ):integer; { xxx -> 0; xxy, xx- etc -> 1; xyz, xy- etc -> 2 } var ach, bch, cch :char; Ans :integer; begin if (a < 1) or (a > Alen) then ach:=chr(0) else ach:=As[a]; if (b < 1) or (b > Blen) then bch:=chr(0) else bch:=Bs[b]; if (c < 1) or (c > Clen) then cch:=chr(0) else cch:=Cs[c]; if ach=bch then if ach=cch then Ans:=0 else Ans:=1 else {ach <> bch} if ach=cch then Ans:=1 else {ach <> bch & ach <> cch} if bch=cch then Ans:=1 else {all different} Ans:=2; triple:=Ans; end {triple}; procedure ShowTriple( a, b, c :integer ); procedure S(c:char; l:integer); begin if l > 0 then write(c, '[', l:2, '] ') else write(' ':6) end; procedure L(var s:string; l:integer); begin if l > 0 then write(s[l], ' ') else write('-', ' ') end; begin if (a < 1) or (a > Alen) then a:=0; if (b < 1) or (b > Blen) then b:=0; if (c < 1) or (c > Clen) then c:=0; S('A', a); S('B', b); S('C', c); write('| '); L(As, a); L(Bs, b); L(Cs, c); writeln end {ShowTriple}; function Ukk(ab, ac, cost :integer):integer; forward; {Ukk and Step are mutually recursive} function Step( PBest :integer; {biggest step seen so far} {diagonal}ab,ac, cost, {direction}da,db,dc :integer; var prevcost :integer) :integer; { step off the "end" of D[]diagonal in direction } { for a total edit-distance of at most cost+1. } var a,b,c, a2,b2,c2, a3 :integer; begin {Step} a := Ukk(ab, ac, cost){recursive}; b := a-ab; c := a-ac; prevcost := cost; a2 := a+da; b2 := b+db; c2 := c+dc; case da+db+dc {# non-null chars} of 1: {eg. x--, -x-, --x cost 1} {skip}; 2: {eg. xx-, x-x, -xx cost 1, or xy-, x-y, -xy cost 2} begin a3 := Ukk(ab, ac, cost-1) + da; while (a2 >= PBest) {this step might be a winner} and (a2 > a3) {a3 is the "bottom line"} and (a2 >= 0) and (b2 >= 0) and (c2 >= 0) and (triple(a2*da, b2*db, c2*dc)=2) {xy-, x-y, -xy} do begin a2:=a2-1; b2:=b2-1; c2:=c2-1; {step back along D-diagonal} {until we find xx-, x-x or -xx cost 1 or until it is not worth it} {statistics} P3Counter := P3Counter + 1 end; if a2=a3 then prevcost := cost-1 end; 3: {eg. xxy, xyx, yxx cost 1, or xyz cost 2; same D-diagonal, da=db=dc=1} {U[ab,ac,c] >= U[ab,ac,c-1] & U[ab,ac,c] >= U[ab,ac,c-2]+1} if triple(a2, b2, c2) = 2 then begin a2 := a; a3 := Ukk(ab, ac, cost-1) + 1; if a3 > a2 then begin a2:=a3; prevcost := cost-1 end end end{case}; Step := a2 end {Step}; function Ukk { (ab, ac, cost :integer):integer; forward-ed } ; var radius, nn,neighbour :integer; a, a2, b, c, da, db, dc, prevcost, pc :integer; begin {Ukk} Assertion((abs(ab) <= MaxCost)and(abs(ac) <= MaxCost)and(cost <= MaxCost), 'D too big '); radius := max7(0, ab, ac, -ab, -ac, ab-ac, ac-ab); if radius > cost then {boundary cond'n} Ukk := -infinity else if cost <= Udone[ab,ac] then {previously computed, so ...} Ukk := U[ab, ac, cost] {... look value up} else {radius <= cost and cost > Udone[ab,ac], so compute it} begin a := -infinity; prevcost := -infinity; {statistics} P1Counter := P1Counter + 1; {statistics} if Udone[ab,ac] < 0 then DiagCounter:=DiagCounter+1; for nn := 1 to 7 do {#7 is , plus 6 other diagonals} begin neighbour := ncode[nn]; da := neighbour div 4; db := neighbour div 2 mod 2; dc := neighbour mod 2; a2 := Step(a, ab-da+db,ac-da+dc, cost-1, da,db,dc, pc){recursive}; if a2 > a then begin a:=a2; {statistics} if (pc=cost-2) and (neighbour in [3,5,6]) then {statistics} RevCounter := RevCounter + 1 end end; b:=a-ab; c:=a-ac; while (a{?=-infinity?} >= 0) and (a < Alen) and (triple(a+1,b+1,c+1)=0) do begin a:=a+1; b:=b+1; c:=c+1; {statistics} P2Counter:=P2Counter+1 end; {statistics} if Udone[ab,ac] > 0 then VolCounter:=VolCounter+a-U[ab,ac,cost-1] {statistics} else VolCounter:=VolCounter+1+max(0,min3(a,b,c)); U[ab, ac, cost] := a; Udone[ab, ac] := cost; {memorise} Ukk := a end{else} end {Ukk}; procedure TraceBack( ab, ac, cost :integer ); {finds and prints an actual alignment from U[]} var a, a2, b, c, da, db, dc, ab2, ac2, prevcost, pc, choice, nn, neighbour, i :integer; begin {TraceBack} Assertion( (cost >= 0) and (cost <= Udone[ab,ac]), 'TraceBack '); a := -infinity; choice := 0; for nn := 1 to 7 do begin neighbour := ncode[nn]; da := neighbour div 4; db := neighbour div 2 mod 2; dc := neighbour mod 2; a2 := Step(a, ab-da+db,ac-da+dc, cost-1, da,db,dc, pc);{eval'd!} if (a2 > {bias to diag} a) or ((a2 >= a)and(choice in [1,2,4])and(neighbour in [3,5,6])) then begin a := a2; choice := neighbour; prevcost := pc end end; a := max(0, a{?-infinity?}); b := a-ab; c := a-ac; Assertion( choice <> 0, 'TraceBack '); da := choice div 4; db := choice div 2 mod 2; dc := choice mod 2; ab2 := ab-da+db; ac2 := ac-da+dc; {the jump-off D[] diagonal} if (a > 1) or (b > 1) or (c > 1) then TraceBack(ab2, ac2, prevcost); if a+b+c > 0 then {just to stop the As[0] Bs[0] Cs[0] | - - - "triple"} begin write(cost:3, ' '); ShowTriple(a*da, b*db, c*dc) end; for i := 1 to Ukk(ab, ac, cost) - a do begin write('" ':4); ShowTriple(a+i, b+i, c+i) end end {TraceBack}; begin {UkkonenR} ncode[1] := 7; {neighbour codes} ncode[2] := 1; ncode[3] := 2; ncode[4] := 4; ncode[5] := 3; ncode[6] := 5; ncode[7] := 6; {statistics} P1Counter:=0;P2Counter:=0; P3Counter:=0; DiagCounter := 1; RevCounter:=0; for ab := -MaxCost to MaxCost do for ac := -MaxCost to MaxCost do Udone[ab,ac] := -3; {nothing yet known, except ...} i:=1; {find initial matching run if any} while (i <= Alen) and (triple(i,i,i)=0) do {start <0,0> D[] diagonal} begin i:=i+1; {statistics} P2Counter := P2Counter + 1 end; {i >= 1 and (i > Alen or triple(i,i,i) > 0) } U[0,0,0] := i-1; Udone[0,0] := 0; {statistics} VolCounter := i; with TheDiag do begin ab:=Alen-Blen; ac:=Alen-Clen; Assertion( (abs(ab) <= MaxCost)and(abs(ac) <= MaxCost), 'len'' diff ') end; with TheDiag do cost := max7(0,ab,ac,-ab,-ac,ab-ac,ac-ab) - 1; repeat cost := cost + 1; with TheDiag do a := Ukk(ab, ac, cost) until a >= Alen; if Alen <= 50 then with TheDiag do TraceBack(ab, ac, cost); writeln('Ukkonen-Rec'' Cost=', cost :1, ' Parts=', P1Counter:1, '[', P1Counter/(1+cost*sqr(cost)/3) :4:2, '],', P2Counter:1, ',', P3Counter:1, ' total[', (P1Counter+P2Counter+P3Counter) / ((2*cost*sqr(cost)+Alen+Blen+Clen)/3) :4:2, ']', ' RC=', RevCounter:1); writeln(' ':12, ' #D[]Diag=', DiagCounter:1, ' D[]Vol=', VolCounter :1, ' ~O(', sqr(cost)*Alen :1, ')', ' v. prod_len''s=', Alen*Blen*Clen :1 ); if (Alen <= 50) and (cost <= 20) then ShowU {print U[,,] for interest} end {UkkonenR}; {-----------------------------------------------------------------------------}