;;; -*- Mode: LISP; Package: :cl-user; BASE: 10; Syntax: ANSI-Common-Lisp; -*- ;;; ;;; HASHTRIE (Knuth's Hashed Trie) [Last Updated:] SMadhu 15th Jun 95 ;;; ;;; Touched: Wed Aug 16 19:59:31 2006 +0530 ;;; Time-stamp: <06/09/16 19:03:42 madhu> ;;; Bugs-To: ;;; Status: Experimental. Do not redistribute. ;;; Copyright (C) 2006 Madhu. All Rights Reserved. ;;; ;;; Based on Knuth's hash trie [C++ version: SMadhu 15th Jun 95] ;;; (defpackage "SIMPLE-HASH-TRIE" (:use "CL")) (in-package "SIMPLE-HASH-TRIE") (defclass simple-hash-trie () ((sz :initarg :alphabet-size) ; alphabet in [1..sz] (trie-sz :initarg :trie-size) ; max number of Pointers (link :type array) (ch :type array) (count :type array) (header :type array) #+golden-alloca (next-h) ; last free header location (empty-slot)) (:documentation " The Hash Trie represents a set of words and all prefixes of those words. For the sake of generality `words' consist of the unsigned ints which are `Char's in the range 1 - SZ. Each word `w' is represented by a Pointer p which indexes 3 large arrays:- LINK - an array of `header locations' CH - an array of child `Char's COUNT - an array of the number of children HEADER is an array indexed by header location, of Pointers. A single letter word will consist of a `Char' in the range [1..SZ]. Such words are represented by the Pointers 1 - SZ. Longer words are defined recursively:- If p represents a word w then p's family is the set of words that have w as a prefix. p's family has a header location at h = LINK[p] and HEADER[h] = p. The children of p are located at h, h+1 ,..., h+COUNT[p]-1; where COUNT[p] is the number of children in p's family. If c is a single `Char' in the range [1..SZ] and q represents the child wc of p, then CH[q] = c. q = LINK[p] + X for some X in [0..COUNT[p]-1]. ")) (defmethod clear-hash-trie ((obj simple-hash-trie)) (with-slots (sz trie-sz header link sibling ch count empty-slot next-h) obj #+golden-alloca (setq next-h (1+ sz)) (setf (aref ch 0) :header (aref link 0) 0 (aref count 0) 0 (aref header 0) 0) (loop for i from 1 to sz do (setf (aref ch i) i (aref link i) 0 (aref count i) 0 (aref header i) 0)) (loop for i from (1+ sz) to trie-sz do (setf (aref ch i) empty-slot (aref count i) 0 (aref link i) 0 (aref header i) 0)))) (defmethod initialize-instance :after ((obj simple-hash-trie) &key &allow-other-keys) (unless (slot-boundp obj 'sz) (error "Required arguments missing: alphabet-size")) (unless (slot-boundp obj 'trie-sz) (error "Required arguments missing: trie-size")) (with-slots (sz trie-sz header link ch count empty-slot) obj (setq header (make-array (1+ trie-sz)) empty-slot 0 link (make-array (1+ trie-sz)) ch (make-array (1+ trie-sz)) count (make-array (1+ trie-sz))) (clear-hash-trie obj))) (defun %find-free-header-location (simple-hash-trie c) "Return h : sz < h <= trie-sz - c, a free header location, followed by c free slots. Return NIL if no such location is available. If compile time feature golden-alloca is not present use the naive but compact version." (declare (type simple-hash-trie simple-hash-trie) (type (integer 0) c)) (with-slots (sz trie-sz ch empty-slot) simple-hash-trie (flet ((check-free (h c) (and (= (aref ch h) empty-slot) (loop for q from (1+ h) repeat c unless (= (aref ch q) empty-slot) return nil finally (return t))))) #-golden-alloca (loop for h from (1+ sz) to (- trie-sz c) if (check-free h c) return h) #+golden-alloca (if (and (check-free next-h c)) next-h (let* ((width (- trie-sz sz c)) (alpha (truncate (* width 0.61803)))) ;; alpha and width are relatively prime. (let* ((x (- next-h sz 1)) (next-x (if (< (+ x alpha) width) (+ x alpha) (+ x alpha (- width))))) (assert (<= 0 x)) ; (assert (< x width)) (assert (<= 0 next-x)) (assert (< next-x width)) (let ((start (+ next-x sz 1)) (end (- trie-sz c))) (assert (and (< sz start) (<= start end))) (let ((ret (or (loop for h from start to end if (check-free h c) return h) (loop for h from (1+ sz) below start if (check-free h c) return h)))) (when ret (prog1 (setq next-h ret) (assert (< sz next-h)) (assert (<= next-h (- trie-sz c))))))))))))) (defun insert (simple-hash-trie p c &aux z) "P is a Pointer representing word w. C is a a single `Char' in the range [1..SZ]. Return a Pointer q which represents the word wc. Return NIL on failure." (declare (type simple-hash-trie simple-hash-trie) (type (integer 1) p c)) (with-slots (CH COUNT EMPTY-SLOT HEADER LINK) simple-hash-trie (cond ((zerop (aref link p)) ;; ;; p has no family. Insert the firstborn child c of p at ;; some header location (h = link[p]) ;; (let ((h (%find-free-header-location simple-hash-trie 0))) (when h (setf (aref link p) h (aref header h) p (aref count p) 1 (aref ch h) c) h))) ;; ;; If p has child c at some location in its family, we are ;; done. Otherwise we'll have to insert child c in p's ;; family. ;; ((setq z (loop initially (assert (> (aref count p) 0)) for qprime from (aref link p) repeat (aref count p) if (= (aref ch qprime) c) return qprime finally (return nil))) z) ;; ;; Check to see if there is room in p's family to ;; accomodate child c. ;; ((/= (aref ch (+ (aref link p) (aref count p))) empty-slot) ;; ;; There is no room, find an appropriate header location ;; to move p's family so that c will fit and move p's ;; family there. ;; (let ((h (%find-free-header-location simple-hash-trie (aref count p)))) (when h (let ((r (aref link p))) ;; move p's family from r to h (loop for i from 0 repeat (aref count p) do (setf (aref ch (+ h i)) (aref ch (+ r i))) (setf (aref ch (+ r i)) empty-slot) (setf (aref link (+ h i)) (aref link (+ r i))) (let ((l (aref header (aref link (+ r i))))) (unless (zerop l) (setf (aref header (aref link (+ r i))) (+ h i)))) (setf (aref link (+ r i)) 0) (setf (aref count (+ h i)) (aref count (+ r i))) (setf (aref count (+ r i)) 0)) (setf (aref link p) h) (setf (aref header h) p (aref header r) 0) (let ((q (+ h (aref count p)))) ;; add c to p's family (setf (aref ch q) c) (incf (aref count p)) q))))) ;; ;; Insert c in p's family ;; (t (let ((h (aref link p))) (let ((q (+ h (aref count p)))) (setf (aref ch q) c) (incf (aref count p)) q)))))) (defun lookup (simple-hash-trie q) "Q is a Pointer to some word w in the trie. Return the sequence of `Char's in the range [1..SZ] comprising w." (declare (type simple-hash-trie simple-hash-trie) (type (integer 1) q)) (with-slots (ch header empty-slot link sz) simple-hash-trie (let (ret) (unless (= (aref ch q) empty-slot) (push (aref ch q) ret) (loop (cond ((<= q sz) (return ret)) (t (setq q (loop for x downfrom q for pprime = (aref header x) unless (zerop pprime) return pprime)) (push (aref ch q) ret)))))))) (define-condition trie-full-condition () ((trie :initarg :trie :reader trie-full-condition-trie) (prefix :initarg :prefix :reader trie-full-condition-prefix) (missed-word :initarg :missed-word :reader trie-full-condition-missed-word)) (:report (lambda (c stream) (format stream "~A: Failed to install ~A." (trie-full-condition-prefix c) (trie-full-condition-missed-word c))))) (defun install (simple-hash-trie sequence) (reduce (lambda (p c) (or (insert simple-hash-trie p c) (error 'trie-full-condition :trie simple-hash-trie :prefix p :missed-word sequence))) sequence)) #|| (setq $x (make-instance 'simple-hash-trie :alphabet-size 26 :trie-size 1000)) (%find-free-header-location $x 1) (setq $link (slot-value $x 'link) $count (slot-value $x 'count) $ch (slot-value $x 'ch) $header (slot-value $x 'header) $x $x) (map 'list 'ctoi "valiant") ; (22 1 12 9 1 14 20) (trace insert1 insert2 insert0) (insert $x 22 1) ; 27 (insert $x 27 12) ; 28 (insert $x 28 9) ; 2 (insert $x 29 1) ; 30 (insert $x 30 14) ; 31 (insert $x 31 20 ) ; 32 (lookup $x 32) (install $x (map 'list 'ctoi "valiant")) (install $x (map 'list 'ctoi "valient")) ; 38 (lookup $x 38) (progn (setq $dict3 (let (*read-eval*) (with-open-file (f "/tmp/dict3.lisp") (read f)))) nil) (length $dict3) (clear-hash-trie $x) (loop for i below 500 do (install $x (aref $dict3 i))) (map 'string 'itoc #(1 2 19 20 18 21 19 5)) (lookup-seq $x #(1 2 19 20 18 21 19 5)) (setq $c (make-instance 'simple-hash-trie :alphabet-size 26 :trie-size 200000) $ch (slot-value $c 'ch) $count (slot-value $c 'count) $header (slot-value $c 'header) $link (slot-value $c 'link) $dummy nil) (loop for i from 0 for x across $dict3 do (install $c (aref $dict3 i))) (install $c (map 'list 'ctoi "valiently")) (lookup $c (map 'list 'ctoi "abandoned")) ||# (defmethod ctoi ((c character)) "Internal. Convert CL character C to an element of the alphabet [1..26]." (assert (alpha-char-p c) nil "Need [A-Za-z].") (let ((code (- (char-code (char-downcase c)) 97 -1))) (assert (<= 1 code 27) nil "Need ASCII.") code)) (defmethod itoc ((i integer)) "Internal Convert I an element of the alphabet [1..26] to a CL character." (assert (<= 1 i 27) nil "Out of bounds") (let ((char (code-char (+ i 97 -1)))) (assert (alpha-char-p char) nil "Need ASCII.") char))