/* ---------------------------------------------------------------- */ /* SOUNDEX.REX */ /* Encode a word (typically a surname) */ /* 09 Jan 1999 Rex Swain, Independent Consultant, www.rexswain.com */ /* ---------------------------------------------------------------- */ /* From Donald E. Knuth's "The Art of Computer Programming", Volume */ /* 3 (Addison-Wesley, 1973, ISBN 0-201-03803-X), pages 391-392. */ /* Attributed to Margaret K. Odell and Robert C. Russell (cf. U.S. */ /* Patents 1261167 (1918), 1435663 (1922)). */ /* ---------------------------------------------------------------- */ /* The logic is: */ /* */ /* 1. Retain the first letter of the name, and drop all occurrences */ /* of a,e,h,i,o,u,w,y in other positions. */ /* */ /* 2. Assign the following digits to the remaining letters after */ /* the first: */ /* */ /* b,f,p,v --> 1 l --> 4 */ /* c,g,j,k,q,s,x,z --> 2 m,n --> 5 */ /* d,t --> 3 r --> 6 */ /* */ /* 3. If two or more letters with the same code were adjacent in */ /* the original name (before step 1), omit all but the first. */ /* */ /* 4. Convert to the form "letter, digit, digit, digit" by adding */ /* trailing zeros (if there are less than 3 digits), or by */ /* dropping rightmost digits (if there are more than three). */ /* ---------------------------------------------------------------- */ /* Beware: Some other implementations of this algorithm do not */ /* handle certain cases correctly. If in doubt, try the following */ /* specific examples given by Knuth: */ /* Euler = E460 */ /* Gauss = G200 */ /* Hilbert = H416 */ /* Knuth = K530 */ /* Lloyd = L300 (many versions fail on this one) */ /* Lukasiewicz = L222 */ /* ---------------------------------------------------------------- */ /* Handling of non-alpha characters (not covered by Knuth): All */ /* non-alpha characters are removed from the argument. If no */ /* characters remain, ' ' (four spaces) is returned. Otherwise */ /* processing continues. So, for example, 'R2 D2' and '@R.-D.' are */ /* both treated as 'RD', returning 'R300'. */ /* ---------------------------------------------------------------- */ parse upper arg n /* Name to encode */ a = 'AEHIOUWYBFPVCGJKQSXZDTLMNR' /* Alphas map to... */ d = '00000000111122222222334556' /* Digits */ /* ----- Squeeze non-alpha characters ----------------------------- */ n = space(n,0) /* Remove all spaces */ i = verify(n,a) /* First non-alpha */ do while i \== 0 /* While bad character */ n = delstr(n,i,1) /* Delete bad character */ i = verify(n,a,,i) /* Find next bad char */ end if n == '' then /* No alpha characters? */ return ' ' /* Return 4 spaces */ /* ----- Map letters to digits ------------------------------------ */ f = left(n,1) /* Keep first letter */ n = translate(n,d,a) /* Translate all to digits */ /* ----- Squeeze duplicate digits --------------------------------- */ i = 1 /* Start at first char */ do while i < length(n) j = i + 1 /* Index of next char */ if substr(n,i,1) == substr(n,j,1) then /* Same as next char? */ n = delstr(n,i,1) /* Delete it */ else i = j /* Advance to next char */ end /* ----- Compose result ------------------------------------------- */ n = substr(n,2) /* Drop first digit */ n = translate(n,' ','0') /* Change zeros to blanks */ n = space(n,0) /* Remove those blanks */ return f || left(n,3,'0') /* Return letter and 3 digits */