;;;; -*- Mode: LISP; Syntax: ANSI-Common-Lisp; Package: COMMON-LISP -*-
;;;; (C) 2003 Madhu. Touched: <13-Sep-03 05:48:21 IST>
;;;; Viewing the IS13194-1991 charset

(defpackage "ISCII"
  (:use "COMMON-LISP")
  (:export "%CONVERT-TO-HTML"))

(in-package "ISCII")

#+clisp
(progn
  ;; load once
  (require 'trie "sequence-search-replace") ;; Mathew Danish' trie
  (provide 'trie))

;;;
;;; - read in map file: maps iscii codes to (CDAC's) DV-TTYogesh fonts

(defvar *map-file-path* #p"iscii_plugin/map_files/windows/Dvngri.map"
  "Pathname of the map file that is read once to initialise *TRIE*.")

(defvar *trie*
  (let ((trie (make-instance 'trie:TRIE))
        (path *map-file-path*))
    (with-open-file (stream path
                            :direction :input
                            :element-type '(unsigned-byte 8)
                            :if-does-not-exist :error)
      ;; WARN is used here purely as a progress message.
      (warn "~&reading ~a: (length=~d bytes)... " path (file-length stream))
      (loop named waltraud
            for lineno from 0
            ;; Key: every byte up to (not including) the first space.
            ;; EOF while reading a key ends the whole load.
            for lhs = (loop for c = (read-byte stream nil)
                            until (cond (c (= c 32)) ; #\Space
                                        (t (warn "...done (read ~a lines)." lineno)
                                           (return-from waltraud)))
                            collect c)  ; explicit consing
            ;; Payload: the remaining bytes of the line.  EOF is treated
            ;; as end of line so that a final line without a trailing
            ;; newline is still registered instead of signalling
            ;; END-OF-FILE; the next key read then sees EOF and stops.
            for rhs = (loop for c = (read-byte stream nil)
                            until (or (null c)
                                      (= c 10) ;; #\Newline
                                      (when (= c 13) ; #\Return. consume #\Newline
                                        (assert (= 10 (read-byte stream)))
                                        t))
                            collect c)
            do (trie:ADD-SEQUENCE trie lhs :payload rhs)))
    trie)
  "Trie built from *MAP-FILE-PATH*.  Each map-file line contributes one
entry: the bytes before the first space are the key sequence, the bytes
after it (up to the line terminator) are the payload.")

(defvar *punctuations*
  (mapcar #'char-code '(#\! #\' #\` #\( #\) #\/ #\: #\;))
  "Character codes that READ1 copies through without leaving ISCII mode.")

(defun category (byte)
  "Return the IS-13194-1991 category keyword of BYTE, or NIL.

BYTE may be any object.  NIL is returned unless BYTE is an integer in
the range 161..255; otherwise the result is one of :VOWEL-MODIFIER,
:VOWEL, :CONSONANT, :VOWEL-SIGN, :HALANT, :NUKTA, :VIRAM, :ATR, :EXT,
:DIGIT or :UNUSED."
  ;; The clauses are tested in ascending order, so every upper-bound
  ;; test implicitly starts just above the previous clause.
  ;; NOTE: byte 218 therefore lands in :VOWEL-SIGN, whereas the grammar
  ;; comment below lists M = 219-231 -- confirm against the standard.
  (when (typep byte '(integer 161 255))
    (cond ((<= byte 163) :vowel-modifier) ; chandrabindu, anuswara, visarg
          ((<= byte 178) :vowel)
          ((<= byte 217) :consonant)      ; 217 = INV
          ((<= byte 231) :vowel-sign)     ; matra
          ((= byte 232) :halant)          ; vowel omission sign
          ((= byte 233) :nukta)           ; diacritic
          ((= byte 234) :viram)           ; viram
          ((<= 235 byte 238) :unused)
          ((= byte 239) :ATR)
          ((= byte 240) :EXT)
          ((<= 241 byte 250) :digit)
          ((<= 251 byte 255) :unused))))

#|
ISCII-91 gave the following grammar for computing word (really akshara)
or character boundaries for characters encoded in ISCII:

  Word                ::= Syllable+ [Cons-Syllable].
  Syllable            ::= Cons-Vowel-Syllable | Vowel-Syllable.
  Vowel-Syllable      ::= V[D].
  Cons-Vowel-Syllable ::= [Cons-Syllable] Full-Cons [M] [D].
  Cons-Syllable       ::= [Pure-Cons] [Pure-Cons] Pure-Cons.
  Pure-Cons           ::= Full-Cons H
  Full-Cons           ::= Cons[N]

  C = 179-217 (Consonant)
  V = 164-178 (Vowel)
  D = 161-163 (Vowel Modifier)
  M = 219-231 (Matra; Vowel Sign)
  H = 232     (Halant; Vowel Omission Sign)
  N = 233     (Nukta; Diacritic)

(However we use ad-hoc parsing culled from iscii_plugin.)
|#

(defmacro next-iscii-word ()
  "Expand into code that consumes one ISCII word (akshara) from the input.

The expansion refers to the symbol macros %PEEK and %KEEP, which must be
defined where this macro is expanded (see the comment following this
definition).  It evaluates to T after keeping at least one byte when the
peeked byte has a CATEGORY, and to NIL, keeping nothing, otherwise."
  '(let* ((ch %PEEK)
          (cat (category ch)))
     (when cat
       %KEEP ;; ISCII
       (case ch
         ((161 166 167 170 234) ;; chandrabindu, i, ii, ri, or viram
          (case (category %PEEK)
            ((:nukta :vowel-modifier) %KEEP)))
         (otherwise
          (case (category ch)
            ((:vowel :vowel-sign)
             (if (eq (category %PEEK) :vowel-modifier)
                 %KEEP))
            (:consonant
             ;; HALANT-FLAG is set by a halant and lets exactly one
             ;; following consonant join the current word.
             (loop with halant-flag = nil
                   do (case (category %PEEK)
                        (:vowel-modifier
                         %KEEP
                         (return))
                        (:vowel-sign
                         (setq ch %PEEK)
                         %KEEP
                         (case (category %PEEK)
                           (:vowel-modifier %KEEP)
                           (:nukta
                            (case ch ;; i, ii, or ATR
                              ((219 220 239) %KEEP))))
                         (return))
                        (:nukta %KEEP)
                        (:vowel (return))
                        (:consonant
                         (cond (halant-flag
                                (setf halant-flag nil)
                                %KEEP)
                               (t (return))))
                        (:halant
                         (setf halant-flag t)
                         %KEEP)
                        (otherwise (return))))))))
       t)))

#|
Problem: The algorithm for computing word (really character or akshara)
boundaries of characters encoded in iscii should be reusable regardless
of the representation of the encoded data (foo-stream, sequence etc.)

Solution: NEXT-ISCII-WORD was written as a "macro" that uses the
following symbol macros, with the semantics indicated alongside:
- (these should be defined in the environment in which
   NEXT-ISCII-WORD is expanded)

  %PEEK - peeks at next byte. Arrange to yield nil on EOF
  %KEEP - consumes peeked byte and accumulates it.
|#

#+nil
(pprint (macroexpand-1 '(iscii::next-iscii-word )))

(defvar *iscii-p* nil
  "True while READ1 is inside a run of ISCII output.  READ1 rebinds it
and signals TOGGLED-ISCII immediately after every change.")

(define-condition toggled-iscii () ()
  (:documentation
   "Signalled by READ1 right after it flips *ISCII-P*; handlers can read
*ISCII-P* to learn the new state."))

(defun read1 (input-stream output-stream) ; explicit consing version
  "Copy INPUT-STREAM to OUTPUT-STREAM, translating ISCII words via *TRIE*.

Both streams are used with READ-BYTE / WRITE-BYTE.  Each ISCII word
recognised by NEXT-ISCII-WORD is looked up in *TRIE* and its payload
bytes are written; all other bytes are written unchanged.  Bytes 0..32
and those in *PUNCTUATIONS* never leave ISCII mode; any other non-ISCII
byte does.  TOGGLED-ISCII is signalled on every mode change.

Returns :EOF at end of input.  Signals an ERROR when a recognised word
has no entry in *TRIE*."
  (loop with next = nil
        and ch = nil
        and *iscii-p* = nil
        ;; No THEN form: STACK is reset to NIL on every iteration.
        for stack = nil
        do (symbol-macrolet
               ((%PEEK (if next
                           next
                           (let ((ch (read-byte input-stream nil)))
                             (if (and (null ch) stack)
                                 (warn "%PEEK REACHED EOF. Unprocessed stack=~a" stack))
                             (setq next ch))))
                (%KEEP (if next
                           (progn (push next stack)
                                  (setq next nil))
                           (error "%KEEP: no extant %PEEK"))))
             (cond ((null (setq ch %PEEK))
                    (return :eof))
                   ((next-iscii-word)
                    ;; this fills up `stack'. dump it.
                    (let* ((target (nreverse stack))
                           (subst (trie:FIND-SEQUENCE *trie* target)))
                      (unless subst
                        ;; SUBST is NIL here, so only TARGET is reported;
                        ;; the control string has a single directive.
                        (error "target=~a no subst" target))
                      (unless *iscii-p*
                        (setq *iscii-p* t)
                        (signal 'toggled-iscii))
                      (loop for c in subst
                            do (write-byte c output-stream))))
                   ((or (<= 0 ch 32) (member ch *punctuations*))
                    ;; NOT ISCII
                    %KEEP
                    (write-byte ch output-stream))
                   (t
                    ;; NOT ISCII BUT ASCII
                    (when *iscii-p*
                      (setq *iscii-p* nil)
                      (signal 'toggled-iscii))
                    %KEEP
                    (write-byte ch output-stream))))))

(defun %write-string (string stream)    ; wither gray
  "Write the character codes of STRING as bytes to the binary STREAM,
then FINISH-OUTPUT on STREAM."
  (loop for c across string
        do (write-byte (char-code c) stream)
        finally (finish-output stream)))

(defun %convert-to-html (infile outfile)
  "Convert the ISCII file INFILE into an HTML fragment in OUTFILE.

INFILE must exist; OUTFILE is superseded if it exists.  Runs of
translated glyph bytes are wrapped in a font element.  READ1's result
is pretty-printed to standard output."
  (with-open-file (i infile ;; #+nil "geeta.isc" "01-alphabet.txt"
                     :direction :input
                     :element-type '(unsigned-byte 8)
                     :if-does-not-exist :error)
    (with-open-file (o outfile ;; "fragment.html"
                       :direction :output
                       :element-type '(unsigned-byte 8)
                       :if-exists :supersede)
      (handler-bind
          ((toggled-iscii
             (lambda (c)
               (declare (ignore c))
               ;; NOTE(review): both branches used to write the empty
               ;; string, which made the toggle a no-op.  The tags
               ;; below are reconstructed from the font markup used by
               ;; %PRINTGLYPH further down -- confirm against the
               ;; intended output.
               (cond (*iscii-p*
                      (%write-string
                       "<font face=\"DV-TTYogesh\" size=\"+2\">" o))
                     (t
                      (%write-string "</font>" o))))))
        (pprint (read1 i o))))))

#+nil
(iscii::%convert-to-html "geeta.isc" "test.html")

#+nil
(iscii::%convert-to-html "01-alphabet.txt" "test.html")

#+nil
(with-open-file (o "dvngri.map" :direction :output)
  (trie:maptrie #'(lambda (target subst)
                    (format o "~a = ~a~&" target subst))
                *trie*))

#+(and nil clisp)
(progn
  ;; load once
  (require 'who #p"cl-who-0.3.0/load.lisp") ;; Edi Weitz' macro CL-WHO
  (provide 'who))

#+(and nil clisp)
(shadowing-import '(cl-who:with-html-output ;; yes clisp exports this too.
                    cl-who:fmt
                    cl-who:str
                    cl-who:htm))

#+nil
(defun %printglyph (target o) ;; o = output-stream
  (let ((subst (trie:FIND-SEQUENCE *trie* target)))
    (if subst
        (with-html-output (o)
          (htm (:font :face "DV-TTYogesh" :size "+2"
                      (loop for byte in subst
                            do (fmt "~c" (code-char byte))))))
        (warn "no subst for ~a" target))))

#+nil
(with-open-file (o "dvngri.html" :direction :output :if-exists :supersede)
  (with-html-output (o)
    (trie:maptrie #'(lambda (target subst)
                      (htm (:font :face "DV-TTYogesh" :size "+2"
                                  (loop for byte in subst
                                        do (fmt "~c" (code-char byte))))
                           (fmt "= ~a~&" target)
                           (:br)))
                  *trie*)))

;; The package prefix used to read `%iscii::printglyph'; no package
;; named %ISCII exists, the function is ISCII::%PRINTGLYPH.
#+nil
(with-open-file (o "test.html" :direction :output :if-exists :supersede)
  (iscii::%printglyph '(234 233) o))