;;;; -*- Mode: LISP; Syntax: ANSI-Common-Lisp; Package: COMMON-LISP -*- ;;;; (C) Madhu 2003. Touched: 19-Sep-03 07:43:05 IST ;;; ITRANS input for writing out ISCII encoded characters (defpackage "ITRANS" (:use "COMMON-LISP") (:export "%PARSE-TO-ISCII")) (in-package "ITRANS") (defpackage "TOK" ;; Tokens correspond to Avinash Chopde's lexer in ITRANS-5.3 (:export #:A #:AA #:I #:II #:U #:UU #:RI #:RII #:LI #:LII #:AY #:AAY #:AI #:O #:OO #:AU #:AM #:AHA #:KA #:KHA #:GA #:GHA #:NGA #:CHA #:CHHA #:JA #:JHA #:JNH #:TTA #:TTHA #:DDA #:DDHA #:NNA #:TA #:THA #:DA #:DHA #:NA #:PA #:PHA #:BA #:BHA #:MA #:YA #:YYA #:RA #:LA #:VA #:SHA #:SHHA #:SA #:HA #:LDA #:KSHA #:GYA #:NNX #:NYA #:RRA #:KADOT #:KHADOT #:GADOT #:DDADOT #:DDHADOT #:JADOT #:PHADOT #:RA_HALF #:ANUSVARA #:CHANDRA #:CHANDRA_BN #:VIRAAM #:AVAGRAHA #:SRI #:AUM)) (defvar *itrans-catalog* '((TOK:A :names ("a") :type :vowel :iscii (164)) (TOK:AA :names ("aa" "A") :type :vowel :iscii (165) :matra (218)) (TOK:I :names ("i") :type :vowel :iscii (166) :matra (219)) (TOK:II :names ("ii" "I") :type :vowel :iscii (167) :matra (220)) (TOK:U :names ("u") :type :vowel :iscii (168) :matra (221)) (TOK:UU :names ("uu" "U") :type :vowel :iscii (169) :matra (222)) (TOK:RI :names ("R^i" "RRi") :type :vowel :iscii (170) :matra (223)) ;;?? eyeballed (TOK:RII :names ("R^I" "RRI") :type :vowel :iscii (170 233) :matra (223 233)) ;;?? eyeballed (TOK:LI :names ("L^i" "LLi") :type :vowel :iscii (166 233)) ;;?? (TOK:LII :names ("L^I" "LLI") :type :vowel) ;;?? confusion about long e ;; ISCII 171, 224 = E (Southern Scripts) ;; ISCII 172, 225 = EY ;; ISCII 173, 226 = AI (TOK:AY :names ("e") :type :vowel :iscii (172) :matra (225)) ;;?? (TOK:AAY :names ("E") :type :vowel) (TOK:AI :names ("ai") :type :vowel :iscii (173) :matra (226)) ;; Couldnt map ISCII 174, 227 = AYE (Devanagari) ;; (looks like e with a half-chandrabindu) ;; mapped for CHANDRA below ;; ;;?? confusion about long o ;; ISCII 175, 228 = O (Southern Scripts) ;; ISCII 176, 229 = OW (TOK:O :names ("o") :type :vowel :iscii (176) :matra (229)) ;;?? (TOK:OO :names ("O") :type :vowel) (TOK:AU :names ("au") :type :vowel :iscii (177) :matra (230)) ;; ;; couldnt map ISCII 178, 231 = AWE (Devanagari) ;; looks like `a' with a half-chandrabindu. ;; ;;?? prolly final M. ITRANS says its a vowel (TOK:AM :type :vowel :iscii (204 232)) ;; ;; Visarg - should be special. ITRANS says its a vowel (TOK:AHA :names ("H") :type :vowel :iscii (163)) ;; ;; Consonants (TOK:KA :names ("k") :type :consonant :iscii (179)) (TOK:KHA :names ("kh") :type :consonant :iscii (180)) (TOK:GA :names ("g") :type :consonant :iscii (181)) (TOK:GHA :names ("gh") :type :consonant :iscii (182)) (TOK:NGA :names ("N^" "~N") :type :consonant :iscii (183)) (TOK:CHA :names ("ch") :type :consonant :iscii (184)) (TOK:CHHA :names ("Ch" "chh") :type :consonant :iscii (185)) (TOK:JA :names ("j") :type :consonant :iscii (186)) (TOK:JHA :names ("jh") :type :consonant :iscii (187)) (TOK:JNH :names ("JN" "~n") :type :consonant :iscii (188)) (TOK:TTA :names ("T") :type :consonant :iscii (189)) (TOK:TTHA :names ("Th") :type :consonant :iscii (190)) (TOK:DDA :names ("D") :type :consonant :iscii (191)) (TOK:DDHA :names ("Dh") :type :consonant :iscii (192)) (TOK:NNA :names ("N") :type :consonant :iscii (193)) (TOK:TA :names ("t") :type :consonant :iscii (194)) (TOK:THA :names ("th") :type :consonant :iscii (195)) (TOK:DA :names ("d") :type :consonant :iscii (196)) (TOK:DHA :names ("dh") :type :consonant :iscii (197)) (TOK:NA :names ("n") :type :consonant :iscii (198)) (TOK:PA :names ("p") :type :consonant :iscii (200)) (TOK:PHA :names ("ph") :type :consonant :iscii (201)) (TOK:BA :names ("b") :type :consonant :iscii (202)) (TOK:BHA :names ("bh") :type :consonant :iscii (203)) (TOK:MA :names ("m") :type :consonant :iscii (204)) (TOK:YA :names ("y") :type :consonant :iscii (205)) ;;?? (TOK:YYA :names ("Y") :type :consonant :iscii (205 232 205)) (TOK:RA :names ("r") :type :consonant :iscii (207)) (TOK:LA :names ("l") :type :consonant :iscii (209)) (TOK:VA :names ("v" "w") :type :consonant :iscii (212)) (TOK:SHA :names ("sh") :type :consonant :iscii (213)) (TOK:SHHA :names ("Sh" "shh") :type :consonant :iscii (214)) (TOK:SA :names ("s") :type :consonant :iscii (215)) (TOK:HA :names ("h") :type :consonant :iscii (216)) ;; ;;?? Marathi lla? (TOK:LDA :names ("lda" "L") :type :consonant) ;; these are conjunct consonants really. ITRANS thinks they're consonants (TOK:KSHA :names ("ksh" "x") :type :consonant :iscii (179 232 214)) (TOK:GYA :names ("dny" "GY") :type :consonant :iscii (181 232 205)) ;; ;;?? tamil bengali. (TOK:NNX :names ("^n") :type :consonant) (TOK:NYA :type :consonant :iscii (198 232 205)) ;; ;;?? Marathi half RA? (TOK:RRA :names ("RRA") :type :consonant) ;; (TOK:KADOT :names ("q") :type :consonant :iscii (179 233)) (TOK:KHADOT :names ("K") :type :consonant :iscii (180 233)) (TOK:GADOT :names ("G") :type :consonant :iscii (181 233)) (TOK:DDADOT :names (".D") :type :consonant :iscii (191 233)) (TOK:DDHADOT :names (".Dh") :type :consonant :iscii (192 233)) (TOK:JADOT :names ("z" "J") :type :consonant :iscii (186 233)) (TOK:PHADOT :names ("f") :type :consonant :iscii (201 233)) ;; ;;?? Hindi half RA?? (TOK:RA_HALF :names ("^r" ".r") :type :special) ;; (TOK:ANUSVARA :names ("M" ".n") :type :special :iscii (162)) ;; ;; ?? ardhachandra - vowel sign AYE (prolly) ;; TODO: check 178, 231 (TOK:CHANDRA :names (".c") :type :special :iscii (227)) ;; ;; chandrabindu (TOK:CHANDRA_BN :type :special :iscii (161) :names (".N")) ;; ;; misnamed. the halant. (viram in ISCII is the danda of ITRANS) (TOK:VIRAAM :names (".h") :type :special :iscii (232)) ;; eyeballed (TOK:AVAGRAHA :names (".a") :type :special :iscii (234 233)) (TOK:SRI :names ("SRI") :type :special :iscii (213 232 207 220)) (TOK:AUM :names ("OM" "AUM") :type :special :iscii (161 233)) )) (loop for (token . plist) in *itrans-catalog* do (setf (symbol-plist token) plist)) #+clisp (progn (require 'clex "clex") ;; Gilbert Baumann's lexer. (Luke's jungearl copy) (provide 'clex)) (defmacro define-itrans-lexer nil `(clex:deflexer ITRANS nil ,@(loop for token being the external-symbols of :TOK for type = (get token :type) and names = (get token :names) nconc (mapcar #'(lambda (name) `(,name (return (values ,type ',token)))) names)) (#\Tab (return (values :punctuation #\Tab))) (#\Space (return (values :punctuation #\Space))) (#\Newline (return (values :punctuation #\Newline))) (#\. (return (values :punctuation #\.))) ((or "0123456789") ;; TODO: map to ISCII digits (of range 241-250) (return (values :punctuation (schar clex:bag 0)))))) (define-itrans-lexer) #+nil (pprint (with-input-from-string (stream "karmaNyevaadhikaaraste maa phaleshhu kadaachana .") (let ((lexer (make-ITRANS-lexer stream))) (loop with bag = nil do (multiple-value-bind (cat token) (funcall lexer) (when (eq cat :eof) (return bag)) (setq bag (nconc bag (list (cons token cat))))))))) (defun %process-word (tokens) "return list of bytes representing iscii for the given list of tokens" (loop for prev-token = nil then token for (token . rest) on tokens for next-token = (if rest (car rest)) for cat = (get token :type) for prev-cat = (if prev-token (get prev-token :type)) for next-cat = (if next-token (get next-token :type)) for code = (ecase cat (:vowel (if (eq prev-cat :consonant) (unless (eq token 'tok:a) (get token :matra)) (get token :iscii))) (:consonant (if (eq next-cat :vowel) (get token :iscii) (append (get token :iscii) '(232)))) (:special (get token :iscii))) appending (progn ;; unnecessary checks (assert (and token cat)) (if prev-token (assert prev-cat)) (if next-token (assert next-cat)) (unless code (assert (eq token 'tok:a))) code) into codes finally (return codes))) #+nil (pprint (let ((tokens (loop for x in '(KA A RA MA A NNA YA AY VA AA DHA I KA AA RA A SA TA AY) for y = (find-symbol (string x) :tok) if y collect y else do (error "not a tok")))) (%process-word tokens))) ;;(179 207 232 204 193 232 205 225 212 218 197 219 179 218 207 215 232 194 225) (defun %process-punctuation (char) (list (char-code char))) (defun %process-special (special &optional prev-letter) ;; nothing special (append prev-letter (list special))) #+clisp (progn (require 'lalr "lalr") ;; Mark Johnson's parser generator (Luke's copy) (provide 'lalr)) (LALR:define-grammar ITRANS-parser ;; terminals (:vowel :consonant :special :halant :punctuation) (dinput --> #'(lambda nil)) (dinput --> dinput word #'(lambda (d w) (append d (%process-word w)))) (dinput --> dinput :punctuation #'(lambda (d p) (append d (%process-punctuation p)))) (word --> oletter #'identity) (word --> word oletter #'(lambda (w o) (append w o))) (oletter --> letter #'identity) (oletter --> letter :special #'(lambda (l s) (%process-special s l))) (oletter --> :special #'(lambda (s) (%process-special s))) (letter --> :vowel #'(lambda (v) (list v))) (letter --> consonants :vowel #'(lambda (cs v) (append cs (list v)))) (letter --> consonants :halant #'(lambda (cs h) (append cs (list h)))) (letter --> consonants #'identity) (consonants --> :consonant #'(lambda (c) (list c))) (consonants --> consonants :consonant #'(lambda (cs c) (append cs (list c)))) ) (defun %parse-to-iscii (stream) "parses ITRANS character stream into consful of bytes" (let ((lexer (make-itrans-lexer stream))) (itrans-parser lexer #'(lambda nil (error 'parse-error))))) #+nil (with-open-file (stream "geeta.itx" :direction :input) (let ((slurp (itrans:%parse-to-iscii stream))) (with-open-file (tmp "tmp.isc" :direction :output :element-type '(unsigned-byte 8) :if-exists :supersede) (iscii::%write-string "
" tmp)
      (loop for byte in slurp do (write-byte byte tmp))
      (iscii::%write-string "
" tmp))) (iscii:%convert-to-html "tmp.isc" "test.html"))