;;;; -*- Mode: Lisp -*- ;;;; Sequence search/replace functions ;;;; Version: 1.3 ;;;; Author: Matthew Danish -- mrd.debian.org ;;;; ;;;; Copyright 2002 Matthew Danish. ;;;; All rights reserved. ;;;; ;;;; Redistribution and use in source and binary forms, with or without ;;;; modification, are permitted provided that the following conditions ;;;; are met: ;;;; 1. Redistributions of source code must retain the above copyright ;;;; notice, this list of conditions and the following disclaimer. ;;;; 2. Redistributions in binary form must reproduce the above copyright ;;;; notice, this list of conditions and the following disclaimer in the ;;;; documentation and/or other materials provided with the distribution. ;;;; 3. Neither the name of the author nor the names of its contributors ;;;; may be used to endorse or promote products derived from this software ;;;; without specific prior written permission. ;;;; ;;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND ;;;; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;;;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ;;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE ;;;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS ;;;; OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) ;;;; HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT ;;;; LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY ;;;; OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF ;;;; SUCH DAMAGE. ;;;; ChangeLog: ;;;; 1.3 ;;;; Added some additional helpful methods to MULTIPLE-STRING-REPLACE ;;;; Exported MULTIPLE-STRING-REPLACE, ;;;; GET-ALL-SEQUENCES, ;;;; MAKE-TRIE-FROM-SEQUENCES, ;;;; MAKE-TRIE-FROM-SUBSTITUTIONS ;;;; ;;;; 1.2 ;;;; Fixed bug in MAKE-SEQUENCE-PREFIX-SEARCHER ;;;; - Failed to reset trie after invocation of closed-over function ;;;; ;;;; 1.1 ;;;; Added/exported MAKE-SEQUENCE-PREFIX-SEARCHER ;;;; ;;;; 1.0 ;;;; Initial version ;;;; - MAKE-STRING-REPLACER ;;;; TODO: ;;;; More functions! ;;;; Better trie ;;;; Ensure that it operates on any sort of sequence ;;;; (including possibly an extensible-sequence type?) (defpackage #:org.mapcar.sequence-search/replace (:use #:common-lisp) (:nicknames #:sequence-search/replace #:trie ;; madhu #:sequence-s/r) (:export #:make-string-replacer #:make-sequence-prefix-searcher ;; #:trie #:add-sequence #:find-sequence ;; really {put,get}trie #:maptrie ;; #:multiple-string-replace #:get-all-sequences #:make-trie-from-substitutions #:make-trie-from-sequences)) (in-package #:org.mapcar.sequence-search/replace) ;;; madhu (eval-when (:compile-toplevel :load-toplevel :execute) (defclass trie-node () ((object :accessor object :initarg :object :initform nil) (children :accessor children :initform nil) (payload :accessor payload :initarg :payload :initform nil) (test :accessor test :initarg :test :initform #'eql))) (defclass trie () ((root-node :accessor root-node :initarg :root-node :initform (make-instance 'trie-node :object 'root)) (current-node :accessor current-node :initarg :current-node :initform nil) (test :accessor test :initarg :test :initform #'eql))) (defclass backtracking-buffer () ((input-stream :initarg :input-stream) (buffer :initform nil) (buffer-pointer :initform nil) (buffer-end :initform nil)));;;) ;;; trie and trie-node (defmethod leaf-node-p ((node trie-node)) (null (children node))) (defmethod add-child ((node trie-node) (child trie-node)) (let ((entry (assoc (object child) (children node) :test (test node)))) (if entry (setf (cdr entry) child) (push (cons (object child) child) (children node))))) (defmethod get-child ((node trie-node) object &key (test #'eql) (key #'identity)) (cdr (assoc object (children node) :test test :key key))) (defmethod map-trie-children (fn (node trie-node)) (loop for (object . child) in (children node) collect (funcall fn child))) (defmethod get-child ((trie trie) object &key (test #'eql) (key #'identity)) (get-child (current-node trie) object :test test :key key)) (defmethod initialize-instance :after ((trie trie) &rest initargs) (declare (ignore initargs)) (setf (current-node trie) (root-node trie))) (defmethod add-child ((trie trie) (object trie-node)) (with-slots (current-node) trie (when current-node (add-child current-node object)))) (defmethod add-child ((trie trie) object) (add-child trie (make-instance 'trie-node :object object :test (test trie)))) (defmacro with-sequence-iterator ((fn sequence) &body body) (let ((index (gensym "I")) (list (gensym "L")) (length (gensym "LEN"))) `(let ((,index 0) (,list (when (listp ,sequence) ,sequence)) (,length (when (vectorp ,sequence) (length ,sequence)))) (flet ((,fn () (etypecase ,sequence (list (unless (null ,list) (multiple-value-prog1 (values t ,index (car ,list)) (incf ,index) (setf ,list (cdr ,list))))) (vector (unless (>= ,index ,length) (multiple-value-prog1 (values t ,index (aref ,sequence ,index)) (incf ,index))))))) ,@body)))) (defmethod add-sequence ((trie trie) sequence &key (payload nil)) (let ((orig-node (current-node trie))) (with-sequence-iterator (next sequence) (loop with prev = orig-node with child = nil for (more index x) = (multiple-value-list (next)) while more ;;; for child = (descend trie x) do ;; madhu (setq child (descend trie x)) (unless child (setf child (typecase x (trie-node x) (t (make-instance 'trie-node :object x)))) (add-child trie child) (descend trie x)) do (setf prev child) finally (setf (payload child) payload))) (setf (current-node trie) orig-node))) (defmethod leaf-node-p ((trie trie)) (leaf-node-p (current-node trie))) (defmethod descend ((trie trie) (object trie-node)) (descend trie (object object))) (defmethod descend ((trie trie) object) (let ((child (get-child trie object))) (when child (setf (current-node trie) child)))) (defmethod reset ((trie trie)) (setf (current-node trie) (root-node trie))) (defmethod payload ((trie trie)) (payload (current-node trie))) (defmethod object ((trie trie)) (object (current-node trie))) ;;; backtracking-buffer (defgeneric fill-buffer (t &optional t1) (:documentation "Append objects from the input-stream onto the buffer, returning the number of objects that are handled, or NIL if none could be")) (defgeneric get-next-object (t) (:documentation "Return next object in buffer, as determined by the buffer-pointer, reading from stream if necessary")) (defgeneric reset-buffer-pointer (t) (:documentation "Restore the buffer-pointer to the beginning of the buffer")) (defgeneric advance-buffer (t &optional t1) (:documentation "Eliminate objects at the head of the buffer, returning number removed or NIL if none")) (defgeneric get-buffer-pointer-position (t) (:documentation "Return an integer which gives the index at which the current buffer-pointer is at in the buffer")) (defmethod fill-buffer ((bb backtracking-buffer) &optional (count 1)) (with-slots (buffer buffer-pointer buffer-end input-stream) bb (let ((actual-count 0)) (handler-case (loop repeat count for object = (read-char input-stream) do (cond ((null buffer) (setf buffer (list object)) (setf buffer-end buffer) (setf buffer-pointer buffer)) ((null buffer-end) (setf buffer-end (last buffer)) (setf (rest buffer-end) (list object)) (unless buffer-pointer (setf buffer-pointer buffer))) (t (setf (rest buffer-end) (list object)) (setf buffer-end (rest buffer-end)) (unless buffer-pointer (setf buffer-pointer buffer-end)))) do (incf actual-count)) (end-of-file () nil)) (when (plusp actual-count) actual-count)))) ; return value (defmethod get-next-object ((bb backtracking-buffer)) (with-slots (buffer buffer-pointer) bb (if buffer-pointer (prog1 (first buffer-pointer) (setf buffer-pointer (rest buffer-pointer))) ; advance pointer (when (fill-buffer bb 1) (get-next-object bb))))) (defmethod reset-buffer-pointer ((bb backtracking-buffer)) (with-slots (buffer buffer-pointer) bb (setf buffer-pointer buffer))) (defmethod advance-buffer ((bb backtracking-buffer) &optional (count 1)) (with-slots (buffer buffer-pointer buffer-end input-stream) bb (let ((actual-count 0)) (flet ((advance-buffer-by-1 () ;; if buffer-pointer is at beginning, advance it too (when (eql buffer-pointer buffer) (setf buffer-pointer (rest buffer-pointer))) ;; advance buffer (setf buffer (rest buffer)) ;; if that was the whole buffer, then set buffer-end ;; to nil for consistency (unless buffer (setf buffer-end nil)))) (loop repeat count do (cond ((null buffer) (if (fill-buffer bb (- count actual-count)) (advance-buffer-by-1) (error 'end-of-file :stream input-stream))) (t (advance-buffer-by-1))) do (incf actual-count))) (when (plusp actual-count) actual-count)))) (defmethod get-buffer-pointer-position ((bb backtracking-buffer)) (with-slots (buffer buffer-pointer) bb (loop for cons on buffer until (eql cons buffer-pointer) summing 1))) ;; string search/replace (defgeneric multiple-string-replace (t t1 t2) (:documentation "Applies the substitutions to the given input and sends them to the given output. Returns the output stream or string.")) (defmethod multiple-string-replace (trie (input string) output) (multiple-string-replace trie (make-string-input-stream input) output)) (defmethod multiple-string-replace (trie input (output string)) (with-output-to-string (output-stream output) (multiple-string-replace trie input output-stream))) (defmethod multiple-string-replace (trie input (output (eql 'nil))) (with-output-to-string (output-stream) (multiple-string-replace trie input output-stream))) (defmethod multiple-string-replace ((subs list) input output) (multiple-string-replace (make-trie-from-substitutions subs) input output)) (defmethod multiple-string-replace (trie (input-stream stream) output) (multiple-string-replace trie (make-instance 'backtracking-buffer :input-stream input-stream) output)) (defmethod multiple-string-replace ((trie trie) (buffer backtracking-buffer) (output-stream stream)) (let ((best-match-pair nil)) ; (skip-len . payload) (flet ((backtrack () (reset-buffer-pointer buffer) (reset trie) (multiple-value-bind (skip-len payload) (if best-match-pair (multiple-value-prog1 (values (car best-match-pair) (cdr best-match-pair)) (setf best-match-pair nil)) (values 1 (handler-case (get-next-object buffer) (end-of-file () nil)))) (when payload (princ payload output-stream)) (handler-case (advance-buffer buffer skip-len) (end-of-file () (return-from multiple-string-replace output-stream)))))) (loop for object = (handler-case (get-next-object buffer) (end-of-file () (if (eql (get-buffer-pointer-position buffer) 0) (return-from multiple-string-replace output-stream) (backtrack)))) do (if (descend trie object) (when (payload trie) (setf best-match-pair (cons (get-buffer-pointer-position buffer) (payload trie)))) (backtrack)))))) (defun add-subs-to-trie (trie subs) (loop for (target . replacement) in subs do (add-sequence trie target :payload replacement)) trie) (defun make-trie-from-sequences (seqs &key (payload t)) "Creates a trie from the list of given sequences, and gives each a payload (defaulting to t)." (let ((trie (make-instance 'trie))) (dolist (seq seqs trie) (add-sequence trie seq :payload payload)))) (defun make-trie-from-substitutions (subs) "Creates a trie from the given substitions, an association list of the form (string . replacement), which can be used for various string search/replacement functions." (add-subs-to-trie (make-instance 'trie) subs)) (defun make-string-replacer (subs) "Accepts an association list of the form (string . replacement) and creates a function which when applied to a string argument will search and replace according to the given list of substitutions." (let ((trie (make-trie-from-substitutions subs))) #'(lambda (string) (with-input-from-string (input string) (with-output-to-string (output) (multiple-string-replace trie input output)))))) (defgeneric get-all-sequences (t &optional t1) (:documentation "Return a list of all sequences in the trie at and below the current point. Sequences in the trie are determined by: nodes which have payloads and leaf-nodes. Optionally adds a prefix to the beginning of every sequence returned. Type of the sequences returned depends upon the type of the prefix. Default is list.")) (defmethod get-all-sequences ((seqs list) &optional (prefix nil)) (get-all-sequences (make-trie-from-sequences seqs :payload t) prefix)) (defmethod get-all-sequences ((trie trie) &optional (prefix nil)) (labels ((add-to-prefix (obj) ; fixme (typecase prefix (string (concatenate 'string prefix (string obj))) (vector (concatenate 'vector prefix (vector obj))) (list (append prefix (list obj))))) (recur (orig-node) (map-trie-children #'(lambda (child) (unwind-protect (progn (descend trie child) (get-all-sequences trie (add-to-prefix (object trie)))) (setf (current-node trie) orig-node))) orig-node))) (if (leaf-node-p trie) (list prefix) (let* ((orig-node (current-node trie)) (result-list (when (payload orig-node) (list prefix)))) (nconc result-list (loop for result in (recur orig-node) nconcing result)))))) (defun make-sequence-prefix-searcher (seqs) "Given a list of sequences, returns a function that when given a sequence argument will return all sequences in the original list that begin with the given sequence" (let ((trie (make-trie-from-sequences seqs :payload t))) #'(lambda (prefix) (unwind-protect (block uh-oh (with-sequence-iterator (next prefix) (loop for (more index char) = (multiple-value-list (next)) while more do (unless (descend trie char) (return-from uh-oh nil)))) (get-all-sequences trie prefix)) (reset trie))))) (defun run-tests () (let ((f (make-string-replacer '(("truth" . "falsity") ("no" . "yes") ("yes" . "no") ("blue" . "red") ("false" . "true") ("?" . "!") ("I" . "i") (", " . "---")))) (strings '("It is false that blue is red and yes is no, yes?"))) (dolist (string strings) (format t "~&~A~%~5T=> ~A~%" string (funcall f string)))) (princ (funcall (sequence-s/r::make-sequence-prefix-searcher '((a b c) (a b c d e f) (b a h) (a d i e u))) '(a))) (terpri) (values)) #| notes: ;;; madhu - the trie could be traversed the same order in which it was filled if the input was sorted? map-trie-children should process children in reverse order - suggest that emu use special variable shaow or something instead of unwind-protecT |# (defmethod find-sequence ((trie trie) sequence) ;; returns payload (reset trie) ;; ALSO MESSES UP CURRENT STATE (with-sequence-iterator (next sequence) (loop with child = nil for (more index x) = (multiple-value-list (next)) while more do (setq child (descend trie x)) finally (when child (return (payload child)))))) ;; todo: handle sequences of different types. How? - add a new ;; slot on trie called sequence-type (vector, list, etc). Then use ;; concatenate instead of append based on this sequence type. (defun maptrie (function trie) "Iterates over all entries in the trie that have payloads or are leaves. For each entry, the function is called with two arguments--the sequence and the payload of that entry." (let ((f (coerce function 'function))) (labels ((recursion-considered-harmful (prefix node &aux (o (object node)) (p (payload node)) (x (unless (eq o 'root) ;; x = extended prefix (append prefix (list o))))) ; ugh workaround (cond ((leaf-node-p node) (funcall f x p)) (t (when p (funcall f x p)) (map-trie-children #'(lambda (my-node) (unwind-protect (progn (descend trie my-node) (recursion-considered-harmful x my-node)) (setf (current-node trie) node))) node))))) (reset trie); root (recursion-considered-harmful nil (root-node trie)))))