;;; -*- Mode: LISP; Package: :cl-user; BASE: 10; Syntax: ANSI-Common-Lisp; -*- ;;; ;;; Touched: Mon Apr 07 19:13:46 2008 +0530 ;;; Time-stamp: <2008-04-10 18:03:33 madhu> ;;; Bugs-To: enometh@net.meer ;;; Status: Experimental. Do not redistribute. ;;; Copyright (C) 2008 Madhu. All Rights Reserved. ;;; ;;; Parse the binary file formats introduced with Opera version 5 for the ;;; various files used in the cache and cookie management. ;;; ;;; Reference [2008-04-08. Some comments and docstrings are taken from this ;;; document]: ;;; ;;; http://www.opera.com/docs/fileformats/index.html #|| (in-package "CL-USER") (progn ; requires frodef's excellent binary types package (require 'binary-types "cmu/binary-types-0.90/binary-types") (provide 'binary-types)) ||# (defpackage "OPERADAT-1" (:use "CL" "BINARY-TYPES")) (in-package "OPERADAT-1") (define-binary-class file-header () ((file_version_number :binary-type 'u32) (app_version_number :binary-type 'u32) (idtag_length :binary-type 'u16 :documentation "Number of bytes in the id tag, presently 1.") (length_length :binary-type 'u16 :documentation "Number of bytes in the length part of a record, presently 2."))) (defun major-and-minor-version-numbers (version-number) "Return as values the major and minor version numbers. The lower 12 bits (bitmask 0x00000fff) represent the minor version number, the rest is the major version number." (values (ash version-number -12) (logand #x00000fff version-number))) #+nil (major-and-minor-version-numbers #x00001000) (defun read-file-header (stream) ;; Integers used in the format are unsigned, and stored in ;; big-endian/network style (Most Significant Byte first). (let ((*endian* :big-endian)) (read-binary 'file-header stream))) ;;; ---------------------------------------------------------------------- ;;; ;;; TESTING BITS ;;; (defvar +opera-homedir-pathname+ (merge-pathnames (make-pathname :name nil :type nil :version nil :directory (list :relative ".opera")) (user-homedir-pathname) nil)) (defvar +known-files+ ;; ident pathname-name pathname-type pathname-dir desc app-major app-minor '((vlink4.dat "vlink4" "dat" nil "Visited link file." 2 0) (dcache4.url "dcache4" "url" ("cache4") "Disk cache index file." 2 0) (download.dat "download" "dat" nil "Download rescue file." 2 0) (cookies4.dat "cookies4" "dat" nil "Cookie store." 2 0))) (defmacro with-open-known-file-stream ((ident &optional stream) &body body) "IDENT is a key in the assoc list +KNOWN-FILES+. It also becomes the name of the open stream variable, if one is not supplied." (let ((ent (assoc ident +known-files+)) (stream-var (if stream stream ident))) (assert ent) (destructuring-bind (id name type dir desc app-major app-minor) ent (declare (ignorable id desc app-major app-minor)) `(with-open-file (,stream-var (merge-pathnames (make-pathname :name ,name :type ,type :version nil :directory (list :relative ,@dir)) +opera-homedir-pathname+) :element-type '(unsigned-byte 8)) ,@body)))) (defmacro cassert (&rest args) "Internal. Provide a continue restart." `(with-simple-restart (continue "Continue.") (assert ,@args))) (defun check-file-header (file-header) "Internal. Testing." (with-slots (file_version_number idtag_length length_length) file-header (cassert (= file_version_number #x00001000)) (multiple-value-bind (file-major file-minor) (major-and-minor-version-numbers file_version_number) (cassert (= 0 file-minor)) (cassert (= 1 file-major))) (cassert (= idtag_length 1)) (cassert (= length_length 2)))) (defun check-application-version (file-header ident) "Internal. Testing." (with-slots (app_version_number) file-header (multiple-value-bind (application-major application-minor) (major-and-minor-version-numbers app_version_number) (let ((ent (assoc ident +known-files+))) (assert ent) (destructuring-bind (id name type dir desc app-major app-minor) ent (declare (ignore id name type dir desc)) (cassert (= app-minor application-minor)) (cassert (= app-major application-major))))))) #+nil (with-open-known-file-stream (cookies4.dat) (let ((file-header (read-file-header cookies4.dat))) (check-file-header file-header) (check-application-version file-header 'cookies4.dat) file-header)) ;;; ---------------------------------------------------------------------- ;;; ;;; TAG TYPES ;;; (eval-when (:load-toplevel :execute :compile-toplevel) (defvar +tag_id_type-and-payload_length_type-lookup-table+ ;; Value tag_id_type payload_length_type '((1 u8 u8) (2 u16 u16) (3 u24 u24) (4 u32 u32)) "The values of the IDTAG_LENGTH and LENGTH_LENGTH fields gives the number of bytes used in the records for the idtags, as defined by the tag_id_type, and the payload length fields, as defined by payload_length_type, respectively. Specifically, the values of these fields define TAG_ID_TYPE and PAYLOAD_LENGTH_TYPE as the following integer types") (defun fetch-binary-type-for-tag_id_type (idtag_length) (second (assoc idtag_length +tag_id_type-and-payload_length_type-lookup-table+))) (defun fetch-binary-type-for-payload_length_type (length_length) (third (assoc length_length +tag_id_type-and-payload_length_type-lookup-table+)))) (deftype tag_id_type (idtag_length) (fetch-binary-type-for-tag_id_type idtag_length)) (deftype payload_length_type (length_length) (fetch-binary-type-for-payload_length_type length_length)) #|| (typep (expt 2 29) '(payload_length_type 4)) (fetch-binary-type-for-payload_length_type 2) (fetch-binary-type-for-tag_id_type 1) ||# ;;; ---------------------------------------------------------------------- ;;; ;;; RECORD HEADERS ;;; (defmacro define-record-header (record-name idtag_length length_length) (let ((name (intern (concatenate 'string (symbol-name record-name) "-HEADER")))) `(define-binary-class ,name () ((tag_id :type (tag_id_type ,idtag_length) :binary-type ,(fetch-binary-type-for-tag_id_type idtag_length) :documentation "Application specific tag to identify content type. This value is application specific, and can be used to indicate the meaning of the payload content.") (length :type (payload_length_type ,length_length) :binary-type ,(fetch-binary-type-for-payload_length_type length_length) :documentation "Length of payload. This field is the number of bytes in the payload that immediately follow the record header. It may be zero."))))) (define-record-header record 1 2) ; defines "RECORD-HEADER" ;;; ---------------------------------------------------------------------- ;;; ;;; Fallback Generic Data record ;;; (defclass generic-record (record-header) ((bytepayload ;; :typically a binary-string :initarg :payload :documentation "Payload/content of the record."))) ;;; ---------------------------------------------------------------------- ;;; ;;; Support for Specific Data Record types ;;; (defvar +record-registry+ nil) (defun add-to-record-registry (tagid class) (let ((ent (assoc tagid +record-registry+))) (cond (ent (let ((old-entry (cdr ent))) (values (setf (cdr ent) class) old-entry))) (t (push (cons tagid class) +record-registry+))))) #+nil (setq +record-registry+ nil) ;;; ---------------------------------------------------------------------- ;;; ;;; Shenanigans ;;; (defun read-record (stream) (let ((*endian* :big-endian)) (let ((record-header (read-binary 'record-header stream))) (munge-flag-tag record-header) ;FIXME (with-slots (tag_id length) record-header (let ((record-class (cdr (assoc tag_id +record-registry+))) (payload (read-binary-string stream :size length))) (change-class record-header (or record-class 'generic-record) :payload payload)))))) (defun read-records (stream) (let ((records (make-array 0 :adjustable t :fill-pointer t))) (handler-case (loop (vector-push-extend (read-record stream) records)) (end-of-file () records)))) #+nil (let (+record-registry+) (with-open-known-file-stream (cookies4.dat) (check-file-header (read-file-header cookies4.dat)) (read-records cookies4.dat))) ;;; ;;; Extension to binary-types-0.90: ;;; (defmacro with-binary-input-from-string ((stream-var string &key (start 0)) &body body) "Bind STREAM-VAR to an object that, when passed to READ-BINARY, provides 8-bit bytes from STRING, which must yield a vector. Binds *BINARY-READ-BYTE* appropriately. This macro will break if this binding is shadowed." (let ((save-brb-var (make-symbol "save-brb")) (length-var (make-symbol "length"))) `(let* ((,save-brb-var *binary-read-byte*) (,stream-var (cons (1- ,start) ,string)) (,length-var (length (cdr ,stream-var))) (*binary-read-byte* #'(lambda (s) (if (eq s ,stream-var) (let ((index (incf (car s)))) (if (< index ,length-var) (char-code (schar (cdr s) index)) (error 'end-of-file))) (funcall ,save-brb-var s))))) ,@body))) (defmacro define-reader (record-class (stream obj) &body body) "Binds TAG_ID and LENGTH of the current record, in addition to STREAM and OBJ." `(defmethod update-instance-for-different-class :after ((.old. record-header) (,obj ,record-class) &key ((:payload .payload.)) &allow-other-keys) (with-binary-input-from-string (,stream .payload.) ,@body))) ;;; ---------------------------------------------------------------------- ;;; ;;; Data types sections ;;; ;;; Integers used in the format are unsigned, and stored in big-endian/network ;;; style (Most Significant Byte first). Integers stored inside the records ;;; are also stored in the big-endian format, but may be signed, and may be ;;; truncated. (define-unsigned uint32 4 :big-endian) (define-unsigned uint8 1 :big-endian) (define-signed int8 1 :big-endian) ;; time_t: uint32 representing a time value in seconds since 00:00 Jan 1, 1970 ;; GMT. The representation may be increased past 32 bit in the future (define-unsigned time_t 4 :big-endian) ;;; Tag_id values in which the MSB (Most Significant Bit) is set to 1, are ;;; reserved for records with implicit no length. The tag_id field is NOT ;;; followed by a length field, nor a payload buffer. Such records are used as ;;; Boolean flags: True if present, False if not present. ;;; ;;; In the binary storage of a file this means that the MSB of the internal ;;; storage integer must be stored as the MSB of the first byte in the tag ;;; field. When a file is read into a program, the program must take care to ;;; move the MSB of the binary stored tag to a common (internal) bit position, ;;; such as the MSB of the program's own unsigned integers. (defun flagp (tag_id) (logbitp 7 tag_id)) (defun tag_id-value (tag_id) (if (flagp tag_id) (values (dpb 0 (byte 1 7) tag_id) t) tag_id)) (defun munge-flag-tag (record-header) (with-slots (tag_id) record-header (multiple-value-bind (tagid flagp) (tag_id-value tag_id) (when flagp (warn "munge-flag-tag ~S: #X~X -> #X~X" record-header tag_id tagid) (setq tag_id tag_id))))) ;;; ---------------------------------------------------------------------- ;;; ;;; The Cache File Formats. (Description) ;;; (defvar +cache-file-formats+ '( ;; NOTE: Contents `nil' indicates that these records contain FIXME HERE ;; with tags from the same set of tag ids. (madhu) ;; Tag ID Contents Meaning #x01 record "Disk Cache Record." #x02 record "Visited Link record." #x41 record "Download record." #x40 string "A Disk cache index file conains exactly one such record, which contains a 5 character string used to find the next free cache file number (oprXXXXX)." ;; Common Elements Between All Files ;; These elements are used by all of the cache related files. In the case ;; of the visited links, these are the only fields presently used. ;; NOTE: Contents `flag' indicates that the most significant bit in the ;; local unsigned integer is set. (madhu) #x0003 string "The name of the URL, fully qualified." #x0004 time_t "Last visited." #x000b flag "The URL is a result of a form query." #x0022 record "Contains the name and last visited time of relative link in the document. May repeat." ;; Content tags of relative link (tag 0x0022) records #x0023 string "The name of the relative link." #x0024 time_t "Last visited." ;; Fields Used by Disk Cache and Download Rescue Files #x0005 time_t "Localtime, when the file was last loaded, not GMT." #x0007 uint8 "Status of load: 2 Loaded 4 Loading aborted 5 Loading failed." #x0008 uint32 "Content size." #x0009 string "MIME type of content." #x000a string "Character set of content." #x000c flag "File is downloaded and stored locally on user's disk, and is not part of the disk cache directory" #x000d string "Name of file (cache files: only local to cache directory)" #x000f flag "Always check if modified." #x0010 record "Contains the HTTP protocol specific information" ;; Fields used only by the download rescuefile #x0028 time_t "Identifies the time when the loading of the last/previous segment of the downloaded file started." #x0029 time_t "Identifies the time when the loading of the last/previous segment of the downloaded file was stopped." #x002A uint32 "How many bytes were in the previous segement of the file being downloaded. If the time the loading ended is not known, this value will be assumed to be zero (0) and the download speed set to zero (unknown)." ;; Fields used in the HTTP protocol specific record ;; All methods are by default GET, at present it is not possible to cache ;; POST requests. #x0015 string "HTTP date header" #x0016 time_t "Expiry date" #x0017 string "Last modified date" #x0018 string "MIME type of document" #x0019 string "Entity tag" #x001A string "Moved to URL (Location header)" #x001B string "Response line text" #x001C uint32 "Response code" #x001D string "Refresh URL" #x001E uint32 "Refresh delta time" #x001F string "Suggested file name" #x0020 string "Content Encodings" #x0021 string "Content Location" #x0025 uint32 "Together with tag 0x0026 (both must be present) this identifies the User Agent string last used to load the resource. This value identifies the User Agent string. This value is used internally, and should not be modified." #x0026 uint32 "Together with tag 0x0025 (both must be present) this identifies the User Agent string last used to load the resource. This value identifies the User Agent sub version. This value is used internally, and should not be modified." #x0030 flag "Reserved for future use." #x0031 flag "Reserved for future use." )) ;;; ---------------------------------------------------------------------- ;;; ;;; Cookie File format (Description) ;;; (defvar +cookie-file-format+ '( ;; The cookie file is organized as a tree of domain name components, each ;; component then holds a tree of path components and each path component ;; may contain a number of cookies. ;; NOTE: The components are a sequence of records, teminated with a flag ;; record, not a single record. ;; All names of domain components are non-dotted, except IP addresses, ;; which can only be stored with the complete IP address as a Quad dotted ;; string, e.g. "10.11.12.13", are stored at the top level, and cannot ;; contain any subdomains. ;; A Domain Record uses the tag "0x01" and contains a sequence of these ;; fields: ;; Tag ID Contents Meaning #x001E string "The name of the domain part" #x001F int8 "How cookies are filtered for this domain. If not present, the filtering of the parent domain is used. 1. All cookies from this domain are accepted. 2. No cookies from this domain are accepted. 3, All cookies from this server are accepted. Overrides 1 and 2 for higher level domains automatics. 4. No cookies from this server are accepted. Overrides 1 and 2 for higher level domains. Domain settings apply to all subdomains, except those with a server specific selection." #x0021 int8 "Handling of cookies that have explicit paths which do not match the URL setting the cookies. If enabled in the privacy preferences the default is to warn the user, but when warning is enabled such cookies can be filtered by their domains: Value 1 indicates reject, and 2 is accept automatically." #x0025 int8 "While in the \"Warn about third party cookies\" mode, this field can be used to automatically filter such cookies. 1. All third party cookies from this domain are accepted. 2. No third party cookies from this domain are accepted. 3. All third party cookies from this server are accepted. Overrides 1 and 2 for higher level domains automatics. 4. No third party cookies from this server are accepted. Overrides 1 and 2 for higher level domains. Domain settings apply to all subdomains, except those with a server specific selection." ;; This record can be followed by zero or more path components defining ;; toplevel paths on servers in the domain and always terminated by a path ;; component terminator record. Then zero or more domain components may ;; follow. ;; A domain component is terminated by a (0x0004 | MSB_VALUE) flag record. ;; PATH COMPONENTS ;; The path components organize the cookies defined for a given directory ;; in a given domain, as well any subdirectories of this directory that ;; have cookies defined. ;; Except for the path component starting immediately after the domain ;; component record, each path component always starts with a path record, ;; and is then followed by any number of cookie records and subdirectory ;; path components. ;; The path record uses the record id "0x0002" and the record has this ;; field record: #x001D string "The name of the path part The path component terminator is the (0x0005 | MSB_VALUE) flag record." ;; COOKIE RECORDS ;; The cookie entries are stored in records of type "0x0003" and have the ;; following field records: #x0010 string "The name of the cookie." #x0011 string "The value of the cookie." #x0012 time_t "Expiry date." #x0013 time_t "Last used." #x0014 string "Comment/Description of use (RFC 2965)." #x0015 string "URL for Comment/Description of use (RFC 2965)." #x0016 string "The domain received with version=1 cookies (RFC 2965)." #x0017 string "The path received with version=1 cookies (RFC 2965)." #x0018 string "The port limitations received with version=1 cookies (RFC 2965)." #x0019 flag "The cookie will only be sent to HTTPS servers." #x001A int8+ "Version number of cookie (RFC 2965)." #x001B flag "This cookie will only be sent to the server that sent it." #x001C flag "Reserved for delete protection: Not yet implemented." #x0020 flag "This cookie will not be sent if the path is only a prefix of the URL. If the path is /foo, /foo/bar will match but not /foobar." #x0022 flag "If true, this cookie was set as the result of a password login form, or by a URL that was retrieved using a cookie that can be tracked back to such a cookie." #x0023 flag "If true, this cookie was set as the result of a HTTP authentication login, or by a URL that was retrieved using a cookie that can be tracked back to such a cookie." #x0024 flag "In \"Display Third party cookies\" mode this flag will be set if the cookie was set by a third party server, and only these cookies will be sent if the URL is a third party. Cookies that were received when loading a URL from the server directly will not be sent to third party URLs in this mode. The reverse is NOT true. NOTE: If a third party server redirects back to the first party server, the redirected URL is considered third party." )) (defun check-format-desc (desc) (let (tagids types) (loop for (tagid type desc) on desc by #'cdddr do (cassert (not (find tagid tagids)) nil "duplicate tagidx for ~X: ~S" tagid (list tagid type desc)) (push tagid tagids) (pushnew type types)) (list (sort tagids #'<) types))) (defun fetch-record-description (tag_id format-desc) (loop for (tag_id0 content desc) on format-desc by #'cdddr when (= tag_id tag_id0) return (list tag_id0 content desc))) #|| (check-format-desc +cache-file-formats+) (check-format-desc +cookie-file-format+) (fetch-record-description 37 +cookie-file-format+) (fetch-record-description #x41 +cache-file-formats+) ||# ;;; ---------------------------------------------------------------------- ;;; ;;; (defun define-cache-data-records (format-desc) (flet ((make-class-name (tag-id) (with-standard-io-syntax (intern (format nil "CX~,4X" tag-id) "OPERADAT-1")))) ;XXX (loop for (tag_id content desc) on format-desc by #'cdddr for class-name = (make-class-name tag_id) collect (ecase content ((string uint8 uint32 time_t) `(defclass ,class-name (generic-record) nil (:documentation ,desc))) (record `(defclass ,class-name (record-header) ((records)) (:documentation ,desc))) (flag `(defclass ,class-name (record-header) nil (:documentation ,desc)))) into defclass-forms collect (ecase content (string `(define-reader ,class-name (stream obj) (setf (slot-value obj 'bytepayload) (with-slots (length) obj (read-binary-string stream :size length))))) ((uint8 uint32 time_t) `(define-reader ,class-name (stream obj) (setf (slot-value obj 'bytepayload) (read-binary ',content stream)))) (record `(define-reader ,class-name (stream obj) (setf (slot-value obj 'records) (read-records stream)))) (flag)) into defreader-forms collect `(add-to-record-registry ,tag_id ',class-name) into initialization-forms finally (eval `(progn ,@defclass-forms)) (eval `(progn ,@defreader-forms)) (eval `(progn ,@initialization-forms)) (return (list defclass-forms defreader-forms initialization-forms))))) (define-cache-data-records +cache-file-formats+) #+nil (with-open-known-file-stream (dcache4.url stream) (read-file-header stream) (read-records stream)) #+nil (with-open-known-file-stream (download.dat stream) (read-file-header stream) (read-records stream))