;;; -*- Mode: LISP; Package: :cl-user; BASE: 10; Syntax: ANSI-Common-Lisp; -*- ;;; ;;; Touched: Sat Apr 14 19:12:48 2007 +0530 ;;; Time-stamp: <2007-05-06 12:54:34 madhu> ;;; Bugs-To: ;;; Status: Experimental. Do not redistribute ;;; Copyright (C) 2007 Madhu. All Rights Reserved. ;;; (defpackage "WTF-UTF-8" (:use "CL")) (in-package "WTF-UTF-8") (defun read-utf8-datum (stream) "Calls READ-BYTE on STREAM. Decodes and returns an integer." (let ((octet (read-byte stream))) (cond ((zerop (logand octet #x80)) octet) ((= #xc0 (logand octet #xe0)) (logior (ash (logand octet #x1f) 6) (logand (read-byte stream) #x3f))) ((= #xe0 (logand octet #xf0)) (logior (logior (ash (logand octet #xf) 12) (ash (logand (read-byte stream) #x3f) 6)) (logand (read-byte stream) #x3f))) ((= #xf0 (logand octet #xf8)) (logior (logior (logior (ash (logand octet #x7) 18) (ash (logand (read-byte stream) #x3f) 12)) (ash (logand (read-byte stream) #x3f) 6)) (logand (read-byte stream) #x3f))) ((= #xf8 (logand octet #xfc)) (logior (logior (logior (logior (ash (logand octet #x3) 24) (ash (logand (read-byte stream) #x3f) 18)) (ash (logand (read-byte stream) #x3f) 12)) (ash (logand (read-byte stream) #x3f) 6)) (logand (read-byte stream) #x3f))) ((= #xfc (logand octet #xfe)) (logior (logior (logior (logior (logior (ash (logand octet #x1) 32) (ash (logand (read-byte stream) #x3f) 24)) (ash (logand (read-byte stream) #x3f) 18)) (ash (logand (read-byte stream) #x3f) 12)) (ash (logand (read-byte stream) #x3f) 6)) (logand (read-byte stream) #x3f))) (t (error "Unexpected value ~X at start of UTF-8 sequence." octet))))) (defun write-utf8-datum (code stream) "Calls WRITE-BYTE on STREAM to encode the integer CODE." (cond ((< code #x80) (write-byte code stream)) ((< code #x800) (write-byte (logior #x80 (logand #x3f code)) stream) (write-byte (logior (logand #xff (ash #x7e 5)) code) stream)) ((< code #x10000) (write-byte (logior #x80 (logand #x3f code)) stream) (write-byte (logior #x80 (logand #x3f (ash code -6))) stream) (write-byte (logior (logand #xff (ash #x7e 4)) (ash code -6)) stream)) ((< code #x20000) (write-byte (logior #x80 (logand #x3f code)) stream) (write-byte (logior #x80 (logand #x3f (ash code -6))) stream) (write-byte (logior #x80 (logand #x3f (ash code -12))) stream) (write-byte (logior #x80 (logand #x3f (ash code -18))) stream) (write-byte (logior (logand #xff (ash #x7e 3)) (ash code -18)) stream)) ((< code #x4000000) (write-byte (logior #x80 (logand #x3f code)) stream) (write-byte (logior #x80 (logand #x3f (ash code -6))) stream) (write-byte (logior #x80 (logand #x3f (ash code -12))) stream) (write-byte (logior #x80 (logand #x3f (ash code -18))) stream) (write-byte (logior #x80 (logand #x3f (ash code -24))) stream) (write-byte (logior (logand #xff (ash #x7e 2)) (ash code -24)) stream)) ((< code #x4000000) (write-byte (logior #x80 (logand #x3f code)) stream) (write-byte (logior #x80 (logand #x3f (ash code -6))) stream) (write-byte (logior #x80 (logand #x3f (ash code -12))) stream) (write-byte (logior #x80 (logand #x3f (ash code -18))) stream) (write-byte (logior #x80 (logand #x3f (ash code -24))) stream) (write-byte (logior #x80 (logand #x3f (ash code -32))) stream) (write-byte (logior (logand #xff (ash #x7e 1)) (ash code -32)) stream)) (t (error "unsupported unicode datum: ~s." code)))) (defun slurp-utf8 (file &optional (array (make-array 1024 :element-type '(integer 0) :adjustable t :fill-pointer 0))) (with-open-file (stream file :element-type '(unsigned-byte 8)) (let (length) (handler-case (loop (assert (setq length (vector-push-extend (read-utf8-datum stream) array)) nil "VECTOR-PUSH-EXTEND FAILED")) (end-of-file (c) (declare (ignore c)) (assert (= length (1- (length array)) )))))) array) (defun dump-utf8 (vector filename) (with-open-file (stream filename :direction :output :if-exists :supersede :element-type '(unsigned-byte 8)) (loop for code-point across vector count 1 do (write-utf8-datum code-point stream)))) #|| (defvar $unicode-array (slurp-utf8 "/var/local/sanskrit/www.shivashakti.com/KuMa.txt")) (array-total-size $unicode-array) (time (progn (setf (fill-pointer $unicode-array) 0) (setq $unicode-array (slurp-utf8 "/var/local/sanskrit/www.shivashakti.com/KuMa.txt" $unicode-array)) (length $unicode-array))) (time (dump-utf8 $unicode-array "home:/kuma.xml")) (progn (setf (fill-pointer $unicode-array) 0) (setq $unicode-array (time (slurp-utf8 "/var/local/sanskrit/indology.info/etexts/archive/texts/veni/venisamhara.xml" $unicode-array))) (length $unicode-array)) (time (dump-utf8 $unicode-array "home:/veni.xml")) (progn (setf (fill-pointer $unicode-array) 0) (setq $unicode-array (time (slurp-utf8 "home:hello.out" $unicode-array))) (time (dump-utf8 $unicode-array "/tmp/hello"))) ||# ;; ;; UTF-16 distinguishes encoding order: ;; UTF-16LE is UTF16 in "little-endian" byte order. (order mark is 0xFF 0xFE) ;; UTF-16BE is UTF16 in "big-endian" byte order. (order mark is 0xFE 0xFF) (defun ucs-2-char-type (byte-order-mark) (ecase byte-order-mark (#xfeff 'utf-16le) (#xfffe 'utf-16be))) (defun read-utf16-datum-12 (stream) (+ (ash (read-byte stream) 8) (read-byte stream))) (defun write-utf16-datum-12 (code stream) (write-byte (logand #xff (ash code -8)) stream) (write-byte (logand #xff code) stream)) (defun read-utf16-datum-21 (stream) (+ (read-byte stream) (ash (read-byte stream) 8))) (defun write-utf16-datum-21 (code stream) (assert (<= code #xffff) nil "Illegal code for ucs-2: ~D" code) (write-byte (logand #xff code) stream) (write-byte (logand #xff (ash code -8)) stream)) ;;; ---------------------------------------------------------------------- ;;; ;;; #+nil (user:lc "home:clisp/mytrie") (defvar *unidata-file* "/scratch/madhu/extern/emacs--unicode-2/admin/unidata/UnicodeData.txt") (eval-when (load eval compile) (defvar +unidata-fields+ '(code unicode-name category combining-class bidi-class decomposition numeric-value1 numeric-value2 numeric-value3 bidi-mirrored older-name iso-comment uppercase lowercase titlecase)) (defmacro defstruct-unidata-line () `(defstruct (unidata-line (:type list) (:constructor %make-unidata-line ,+unidata-fields+)) ,@+unidata-fields+)) (defstruct-unidata-line) (defvar *unicode-names* (trie:make-trie)) (defvar *older-names* (trie:make-trie)) (defvar *categories* (trie:make-trie)) (defvar *iso-comments* (trie:make-trie)) (defvar *bidi-classes* (trie:make-trie)) (defvar *decomposition-names* (trie:make-trie)) ;; #+NIL XXX (defun pluralize (string) (let ((length (length string)) suffix-length) (labels ((ends-with (suffix) (if (<= (setq suffix-length (length suffix)) length) (string-equal string suffix :start1 (- length suffix-length)) (setq suffix-length nil))) (prev-char () (char string (- length suffix-length 1))) (fixcase (suffix) (if (upper-case-p (prev-char)) (string-upcase suffix) suffix)) (retval (suffix &optional replace) (concatenate 'string (if (and replace (< suffix-length length)) (subseq string 0 (- length suffix-length)) string) (fixcase suffix)))) (cond ((ends-with "y") (cond ((not (find (prev-char) "aeiou" :test #'char-equal)) (retval "ies" t)) (t (retval "s")))) ((ends-with "ies") nil) ((ends-with "s") (cond ((find (prev-char) "aeious" :test #'char-equal) (retval "es")) (t nil))) (t (setq suffix-length 0) (cond ((alpha-char-p (prev-char)) (retval "s")) (t nil))))))) ;;#+NIL XXX (defun trie-name-for-symbol (sym) (intern (concatenate 'string "*" (or (pluralize (symbol-name sym)) (concatenate 'string (symbol-name sym) "-NAMES")) "*"))) ;;#+NIL XXX (defmacro defvar-tries () `(progn ,@(loop for x in +unidata-fields+ collect `(defvar ,(trie-name-for-symbol x) (trie:make-trie))))) ;;#+NIL XXX (defvar-tries) (defun parse-decomposition (string) (loop for substring in (user::string-split #\Space string) for i from 0 do (assert (> (length substring) 0)) if (and (zerop i) (char= #\< (char substring 0))) do (assert (char= #\> (char substring (1- (length substring))))) and collect (trie:intern-string substring :strings-trie *decomposition-names*) else collect (parse-integer substring :radix 16))) (defun transform-unidata-arg (symbol) `(when (> (length ,symbol) 0) ;;#+NIL XXX (trie:intern-string ,symbol :strings-trie ,(trie-name-for-symbol symbol)) ,(ecase symbol (unicode-name `(trie:intern-string ,symbol :strings-trie *unicode-names*)) (older-name `(trie:intern-string ,symbol :strings-trie *older-names*)) (category `(trie:intern-string ,symbol :strings-trie *categories*)) (bidi-class `(trie:intern-string ,symbol :strings-trie *bidi-classes*)) (iso-comment `(trie:intern-string ,symbol :strings-trie *iso-comments*)) ((code) `(parse-integer ,symbol :radix 16)) ((numeric-value3 numeric-value2 numeric-value1) `(let* (*read-eval* (num (read-from-string ,symbol))) (assert (numberp num)) num)) (uppercase `(parse-integer ,symbol :radix 16)) (lowercase `(parse-integer ,symbol :radix 16)) (titlecase `(parse-integer ,symbol :radix 16)) (decomposition `(parse-decomposition ,symbol)) (bidi-mirrored `(progn (assert (= (length ,symbol) 1)) (ecase (char ,symbol 0) (#\Y t) (#\N nil)))) (combining-class `(parse-integer ,symbol))))) (defmacro defun-make-unidata-line () `(defun make-unidata-line (line) (destructuring-bind ,+unidata-fields+ (user::string-split #\; line) (%make-unidata-line ,@(loop for x in +unidata-fields+ collect (transform-unidata-arg x)))))) (defun-make-unidata-line)) (defvar *all-codes* (make-hash-table)) (defun read-unidata-file (&optional (file *unidata-file*)) (with-open-file (stream file) (loop for line = (read-line stream nil nil) for lineno from 1 while line do (let ((unidata-line (make-unidata-line line))) (multiple-value-bind (value foundp) (gethash (unidata-line-code unidata-line) *all-codes*) (assert (not foundp) nil "Duplicate code ~D on line ~D" (unidata-line-code unidata-line) lineno)) (setf (gethash (unidata-line-code unidata-line) *all-codes*) unidata-line))))) (defun make-unidata-form (list) (loop for y in list for x in +unidata-fields+ when y collect (intern (symbol-name x) :keyword) and collect y)) (defun trie-complete (trie) (mapcar (lambda (x) (coerce x 'string)) (trie:trie-completions trie))) #|| (clrhash *all-codes*) (time (read-unidata-file)) (gethash #x106 *all-codes*) (make-unidata-form (gethash #x106 *all-codes*)) (trie::trie-count *unicode-names*) (mapcar (lambda (x) (trie-name-for-symbol x)) +unidata-fields+) (loop for x in +unidata-fields+ for trie-name = (trie-name-for-symbol x) for trie = (symbol-value trie-name) do (format t "~%~% ~S~% ~S" trie-name (trie-complete trie))) (pprint (trie-complete *decompositions*)) (hash-table-count *all-codes*) ||#