;;; XML-poly.el --- XML parser ;; Copyright (C) 2002, 2003 MIYASHITA Hisashi ;; Keywords: XML, namespace, parser ;; XML-poly is free software; you can redistribute it and/or modify ;; it under the terms of the GNU General Public License as published by ;; the Free Software Foundation; either version 2, or (at your option) ;; any later version. ;; XML-poly is distributed in the hope that it will be useful, ;; but WITHOUT ANY WARRANTY; without even the implied warranty of ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;; GNU General Public License for more details. ;; You should have received a copy of the GNU General Public License ;; along with this program; see the file COPYING. If not, write to the ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, ;; Boston, MA 02111-1307, USA. ;; Comment: ;; XML-poly is an XML parser that can properly treat XML namespace. ;; XML-poly parses XML document to lisp objects as follows. ;; ;; 'NAME means NAME is an symbol of NAME that is interned ;; to the standard obarray. ;; ;; := ( . ( . )) | marker? ;; := point ;; := point ;; := ['XML ] ;; := ('DOCTYPE ;; []) ;; := ('XML-decl ) ;; := ('STAG ] ;; := ('ETAG ) ;; , , , := string ;; := [ ] ;; := symbol ;; := (*) ;; := ( []) ;; := string ;; := ( | <#pcdata> | | )* ;; := ( | )* ;; <#pcdata> := string/('XML-poly-range 'XML-poly-options ) ;; := string/('XML-poly-CDATA t 'XML-poly-range 'XML-poly-options ) ;; := ('PI string ) ;; := string ;; := ('COMMENT string []) ;; := alist ;; ;; All symbols generated by the parser are not belonging to the standard obarray. ;; XML-poly makes one obarray per one XML namespace, and set the symbols value ;; to the obarray. To refer such symbols via string, use XML-poly-intern. (require 'unicode) ;;; parser options (defvar XML-poly-include-comment nil "* Whether comments are put into a parsed object or not.") (defvar XML-poly-normalize-PCDATA nil "* Whether PCDATA is normalized or not.") (defvar XML-poly-resolve-reference t " * Whether it resolves references in CharData or not.") (defvar XML-poly-check-duplicated-attribute t " * Whether it checks duplicated attributes in the same element.") (defvar XML-poly-construct-object t "* Whether XML-poly creates objects when paring a document.") (defvar XML-poly-set-text-property t "* Whether XML-poly set text properties to PCDATA and CDATA.") (defvar XML-poly-pull-interface nil) (defvar XML-poly-use-URI-resolver nil) ;;; error (put 'XML-poly-invalid-obj-error 'error-conditions '(XML-poly-fatal XML-poly-error error)) (put 'XML-poly-invalid-obj-error 'error-message "XML-poly: invalid parsed object.") ;;; range object (defsubst XML-poly-make-range (beg end buf) (cons buf (cons beg end))) (defsubst XML-poly-get-range-beg (range) (car (cdr range))) (defsubst XML-poly-get-range-end (range) (cdr (cdr range))) (defsubst XML-poly-get-range-buf (range) (car range)) ;;; ['XML ] (defsubst XML-poly-create-obj () (vector 'XML nil nil nil nil)) (defsubst XML-poly-get-decl (parsed) (aref parsed 1)) (defsubst XML-poly-set-decl (parsed decl) (aset parsed 1 decl)) (defsubst XML-poly-get-doctype (parsed) (aref parsed 2)) (defsubst XML-poly-set-doctype (parsed doctype) (aset parsed 2 doctype)) (defsubst XML-poly-get-element (parsed) (aref parsed 3)) (defsubst XML-poly-set-element (parsed elem) (aset parsed 3 elem)) (defun XML-poly-object-type (obj) "Return the type of parsed XML OBJ. It returns either of XML, DOCTYPE, ELEMENT, COMMENT, CDATA, PI, or PCDATA." (cond ((vectorp obj) (if (eq (aref obj 0) 'XML) 'XML 'ELEMENT)) ((listp obj) (car obj)) ((stringp obj) (if (get-text-property 0 'XML-poly-CDATA obj) 'CDATA 'PCDATA)) (t (signal 'XML-poly-invalid-obj-error obj)))) (defun XML-poly-get-options (obj) (cond ((vectorp obj) (aref obj 4)) ((stringp obj) (get-text-property 0 'XML-poly-options obj)) ((listp obj) (let ((e (car obj))) (cond ((eq e 'DOCTYPE) (nth 6 obj)) ((eq e 'PI) (nth 4 obj)) ((eq e 'COMMENT) (nth 3 obj)) (t (signal 'XML-poly-invalid-obj-error obj))))) (t (signal 'XML-poly-invalid-obj-error obj)))) (defun XML-poly-set-options (obj alist) (cond ((vectorp obj) (aset obj 4 alist)) ((stringp obj) (put-text-property 0 (length obj) 'XML-poly-options alist obj)) ((listp obj) (let ((e (car obj)) (l (nthcdr (cond ((eq e 'DOCTYPE) 6) ((eq e 'PI) 4) ((eq e 'COMMENT) 3) (t (signal 'XML-poly-invalid-obj-error obj))) obj))) (if l (setcar l alist) (nconc obj (list alist))))) (t (signal 'XML-poly-invalid-obj-error obj)))) (defun XML-poly-get-range (obj) (cond ((vectorp obj) (if (eq (aref obj 0) 'XML) nil (aref obj 3))) ((listp obj) (let ((e (car obj))) (cond ((eq e 'DOCTYPE) (nth 5 obj)) ((eq e 'PI) (nth 3 obj)) ((eq e 'COMMENT) (nth 2 obj)) ((symbolp e) ;; attribute (nth 2 obj)) (t (signal 'XML-poly-invalid-obj-error obj))))) ((stringp obj) (get-text-property 0 'XML-poly-range obj)) (t (signal 'XML-poly-invalid-obj-error obj)))) ;;; Object construcion wrapper (defsubst XML-poly-create-object (x) (if (functionp XML-poly-pull-interface) (funcall XML-poly-pull-interface)) x) ;;; Error message (defvar XML-poly-error-handler nil) (defun XML-poly-parse-error (mes &optional beg end) (if (functionp XML-poly-error-handler) (funcall XML-poly-error-handler mes beg end) (message mes) nil)) ;;; XML symbol manager (defvar XML-poly-namespace-obarray (make-vector 751 0)) (defvar XML-poly-xmlns-obarray (make-vector 23 0)) ;; when namespace URI is nil, it is regarded as local namespace. (defun XML-poly-make-namespace (uri) (let* ((ns (make-vector 17 0)) (usym (intern "/" ns)) (nssym (intern uri XML-poly-namespace-obarray))) (set nssym ns) (set usym uri) ns)) (defsubst XML-poly-get-namespace (uri) (or (symbol-value (intern-soft uri XML-poly-namespace-obarray)) (XML-poly-make-namespace uri))) (defun XML-poly-intern (name uri) (let* ((ns (XML-poly-get-namespace uri)) (sym (intern name ns))) (set sym ns) sym)) (defsubst XML-poly-intern-for-ns (name ns) (setq name (intern name ns)) (set name ns) name) (defun XML-poly-intern-soft (name uri) (let ((sym (intern-soft uri XML-poly-namespace-obarray))) (if sym (intern-soft name (symbol-value sym))))) (defun XML-poly-get-symbol-namespace (sym) (symbol-value sym)) (defsubst XML-poly-get-namespace-URI (ns) (symbol-value (intern "/" ns))) (defsubst XML-poly-intern-xmlns-symbol (prefix) (let ((sym (intern prefix XML-poly-xmlns-obarray))) (set sym 'xmlns) sym)) (defsubst XML-poly-xmlns-symbol-p (sym) (and (boundp sym) (eq (symbol-value sym) 'xmlns))) ;; ;; XML-poly iterator (Sliding part? :-P) ;; ;; [buf point path ns-prefices obj] (defsubst XML-poly-handle-get-buffer (h) (aref h 0)) (defsubst XML-poly-handle-get-point (h) (aref h 1)) (defsubst XML-poly-handle-set-point (h pt) (aset h 1 pt)) (defsubst XML-poly-handle-get-path-list (h) (aref h 2)) (defsubst XML-poly-handle-get-path (h) (car (aref h 2))) (defsubst XML-poly-handle-push-path (h name) (aset h 2 (cons name (aref h 2)))) (defsubst XML-poly-handle-pop-path (h) (aset h 2 (cdr (aref h 2)))) (defsubst XML-poly-handle-set-obj (h obj) (aset h 4 obj) obj) (defsubst XML-poly-handle-get-obj (h) (aref h 4)) (defconst XML-poly-default-prefix-alist (list (cons "xml:" (XML-poly-get-namespace "http://www.w3.org/XML/1998/namespace")))) (defsubst XML-poly-handle-get-ns-prefix-alist (h) (car (aref h 3))) (defsubst XML-poly-handle-add-ns-prefix (h p) (setcar (aref h 3) (cons p (car (aref h 3))))) (defsubst XML-poly-handle-push-ns-prefix-alist (h) (aset h 3 (cons (car (aref h 3)) (aref h 3)))) (defsubst XML-poly-handle-pop-ns-prefix-alist (h) (aset h 3 (cdr (aref h 3)))) (defun XML-poly-make-new-handle (buf pt) (vector buf pt nil (list XML-poly-default-prefix-alist) nil)) (defun XML-poly-make-duplicate-handle (h) (setq h (copy-sequence h)) (aset h 3 (copy-sequence (aref h 3))) h) (defconst XML-poly-local-namespace (XML-poly-get-namespace "")) (defconst XML-poly-invalid-namespace (XML-poly-get-namespace "http://www.meadowy.org/XML-poly/invalid")) (defconst XML-poly-base (XML-poly-intern "base" "http://www.w3.org/XML/1998/namespace")) (defsubst XML-poly-add-namespace-prefix (h prefix uri) (XML-poly-handle-add-ns-prefix h (cons prefix (XML-poly-get-namespace uri)))) (defsubst XML-poly-prefix-namespace (h prefix) (or (cdr (assoc prefix (XML-poly-handle-get-ns-prefix-alist h))) (if prefix (progn (XML-poly-parse-error (format "Prefix:%S is not defined in thie document." prefix) (point)) XML-poly-invalid-namespace)) XML-poly-local-namespace)) (defsubst XML-poly-intern-for-prefixed-name (h name prefix) (let* ((ns (XML-poly-prefix-namespace h prefix)) (sym (intern name ns))) (set sym ns) sym)) ;;; XML S(space) and Eq(eqaul). (defconst XML-poly-S-regexp "[ \t\n\r]+") (defconst XML-poly-S*-regexp "[ \t\n\r]*") (defconst XML-poly-not-S-regexp "[^ \t\n\r]") (defconst XML-poly-Eq-regexp (concat XML-poly-S*-regexp "?=" XML-poly-S*-regexp "?")) ;; Strictly speaking, the below definition is incorrect. ;; We should use character category table to define it. (defconst XML-poly-name-regexp "\\([^ \t\n\r%?<>&/+,.:=]*:\\)?\\([^ \t\n\r%?<>&/+,.:=]+\\)") (defconst XML-poly-ncname-regexp "[^ \t\n\r%?<>&/+,.:=]+") (defconst XML-poly-PItarget-regexp "\\([^ \t\n\r%?<>&/+,.:=]+\\)") (defconst XML-poly-doctype-name-regexp "\\([^ \t\n\r%?<>&/+,.:=]+\\)") ;;; entity reference (defconst XML-poly-entity-reference-regexp "&\\([^;]+\\);") (defconst XML-poly-character10n-entity-reference-regexp "\\`#\\([0-9]+\\)\\'") (defconst XML-poly-character16n-entity-reference-regexp "\\`#x\\([0-9a-fA-F]+\\)\\'") (defun XML-poly-resolve-reference (str) (if XML-poly-resolve-reference (let ((s 0) m r) (while (string-match XML-poly-entity-reference-regexp str s) (setq m (match-string 1 str) s (1+ (match-beginning 0))) (save-match-data (cond ((string-match XML-poly-character10n-entity-reference-regexp m) (setq r (char-to-string (ucs-to-char (string-to-number (match-string 1 m) 10))))) ((string-match XML-poly-character16n-entity-reference-regexp m) (setq r (char-to-string (ucs-to-char (string-to-number (match-string 1 m) 16))))) (t ;; TODO... (setq r nil) ))) (if r (setq str (replace-match r t t str)))))) str) ;; parameter entity reference ;; character reference ;;; ;;; processing part ;;; ;;; util. (defsubst XML-poly-next-token () (let ((r (search-forward-regexp XML-poly-not-S-regexp nil t))) (if r (backward-char 1)) r)) (defsubst XML-poly-skip-token () (let ((r (search-forward-regexp XML-poly-S-regexp nil t))) (if r (backward-char 1)) r)) ;;; XML-decl (defconst XML-poly-version-number-regexp "\\('[-a-zA-Z0-9_.:]+'\\|\"[-a-zA-Z0-9_.:]+\"\\)") (defconst XML-poly-encoding-name-regexp "\\('[A-Za-z][-A-Za-z0-9._]*'\\|\"[A-Za-z][-A-Za-z0-9._]*\"\\)") (defconst XML-poly-yesno-regexp "\\('yes'\\|\"yes\"\\|'no'\\|\"no\"\\)") (defconst XML-poly-decl-head-regexp (concat "<\\?xml" XML-poly-S-regexp)) (defconst XML-poly-decl-regexp (concat "<\\?xml" XML-poly-S-regexp "version" XML-poly-Eq-regexp XML-poly-version-number-regexp "\\(" XML-poly-S-regexp "encoding" XML-poly-Eq-regexp XML-poly-encoding-name-regexp "\\)?" "\\(" XML-poly-S-regexp "standalone" XML-poly-Eq-regexp XML-poly-yesno-regexp "\\)?" XML-poly-S*-regexp "\\?>")) (defun XML-poly-parse-decl () (if (looking-at XML-poly-decl-regexp) (let ((beg (point)) (version (match-string 1)) (encoding (match-string 3)) (standalone (match-string 5)) (dend (match-end 0))) (goto-char dend) (list 'XML-decl (list (cons 'version version) (cons 'encoding encoding) (cons 'standalonep (and standalone (if (string-match "yes" standalone) t)))) (XML-poly-make-range beg dend (current-buffer)))) (XML-poly-parse-error "Invalid XML declaration." (point) (XML-poly-skip-token)) nil)) ;;; DOCTYPE (defconst XML-poly-doctype-head-regexp (concat "")) (defun XML-poly-parse-doctype () (if (looking-at XML-poly-doctype-regexp) (let ((beg (point)) (name (match-string 1)) (pub (match-string 4)) (sys (or (match-string 5) (match-string 6))) (markupdecl (match-string 8)) (dend (match-end 0))) (goto-char dend) (list 'DOCTYPE name pub sys markupdecl (XML-poly-make-range beg dend (current-buffer)))) (XML-poly-parse-error "Invalid XML declaration." (point) (XML-poly-skip-token)) nil)) (defsubst XML-poly-get-DOCTYPE-range (doctype) (nth 5 doctype)) ;;; PI (defconst XML-poly-PI-regexp (concat "<\\?" XML-poly-PItarget-regexp)) (defun XML-poly-parse-PI () (let ((beg (point)) name pst) (if (and (looking-at XML-poly-PI-regexp) (progn (setq name (match-string 1) pst (match-end 0)) (goto-char pst) (search-forward "?>" nil t))) (list 'PI name (buffer-substring pst (- (point) 2)) (XML-poly-make-range beg (point) (current-buffer))) (XML-poly-parse-error "Invalid PI." (point) (XML-poly-skip-token))))) ;;; comment (defun XML-poly-parse-COMMENT () (let ((begin (point))) (if (search-forward "-->" nil t) (progn (list 'COMMENT (buffer-substring (+ begin 4) ;; the length of "" (point) (XML-poly-skip-token)) nil))) ;;; text (defun XML-poly-extract-text (beg end &optional props fwd) (if (null fwd) (setq fwd beg) (setq fwd (+ beg fwd))) (if XML-poly-set-text-property (let ((text (XML-poly-resolve-reference (buffer-substring fwd end)))) (set-text-properties 0 (length text) (cons 'XML-poly-range (cons (XML-poly-make-range beg end (current-buffer)) props)) text) text) (XML-poly-resolve-reference (buffer-substring fwd end)))) ;;; CDATA ;; Note that we assume to be matched with "" nil t) (progn (XML-poly-extract-text begin (match-beginning 0) '(XML-poly-CDATA t) 9)) ;; the length of "" (point) (XML-poly-skip-token)) nil))) ;;; PCDATA (defun XML-poly-parse-PCDATA (begin) (goto-char begin) (if (search-forward "<" nil t) (progn (backward-char) (XML-poly-extract-text begin (point))) (XML-poly-parse-error "Invalid PCDATA" (point) (XML-poly-skip-token)) nil)) ;;; attribute (defsubst XML-poly-get-attribute-name (a) (car a)) (defsubst XML-poly-get-attribute-value (a) (nth 1 a)) (defconst XML-poly-attribute-value-regexp "\\(\"[^<\"]*\"\\|'[^<']*'\\)") (defconst XML-poly-attribute-regexp (concat "\\(" XML-poly-S-regexp XML-poly-name-regexp XML-poly-Eq-regexp XML-poly-attribute-value-regexp "\\)")) (defun XML-poly-parse-attributes (h) (let (prefix name elem value ps pe r result) (while (looking-at XML-poly-attribute-regexp) (setq prefix (match-string 2) name (match-string 3) value (match-string 4) ps (match-beginning 0) pe (match-end 0) value (XML-poly-resolve-reference (substring value 1 (1- (length value))))) (if prefix (progn (if (string= prefix "xmlns:") (progn (XML-poly-add-namespace-prefix h (concat name ":") value) (setq name (XML-poly-intern-xmlns-symbol name)))) (setq r (cons (list prefix name value (cons ps pe)) r))) (if (string= name "xmlns") (progn (XML-poly-add-namespace-prefix h nil value) (setq name (XML-poly-intern-xmlns-symbol "")))) (setq r (cons (list nil name value (cons ps pe)) r))) (goto-char pe)) (while r (setq elem (car r) name (nth 1 elem) prefix (car elem) result (cons (list (if (symbolp name) name (if prefix (XML-poly-intern-for-prefixed-name h name prefix) (XML-poly-intern-for-ns name XML-poly-local-namespace))) (nth 2 elem) (XML-poly-make-range ps pe (current-buffer))) result) r (cdr r))) (XML-poly-next-token) result)) ;;; tag (defconst XML-poly-stag-head-regexp (concat "<" XML-poly-name-regexp ;; 1:2 )) (defconst XML-poly-etag-regexp (concat "")) (defun XML-poly-parse-tag (h) (let ((begin (point)) name prefix atts) (cond ((looking-at XML-poly-etag-regexp) (setq prefix (match-string 1) name (match-string 2)) (setq name (XML-poly-intern-for-prefixed-name h name prefix)) (goto-char (match-end 0)) (list 'ETAG name (XML-poly-make-range begin (point) (current-buffer)) nil)) ((looking-at XML-poly-stag-head-regexp) (setq prefix (match-string 1) name (match-string 2)) (goto-char (match-end 0)) (XML-poly-handle-push-ns-prefix-alist h) (setq atts (XML-poly-parse-attributes h)) (setq name (XML-poly-intern-for-prefixed-name h name prefix)) (cond ((looking-at "/>") ;; empty element (goto-char (match-end 0)) (XML-poly-handle-pop-ns-prefix-alist h) (XML-poly-create-element-obj name atts ; attributes nil ; contents (XML-poly-make-range begin (point) (current-buffer)) nil)) ((looking-at ">") ;; start tag (goto-char (match-end 0)) (XML-poly-handle-push-path h name) (list 'STAG name atts (XML-poly-make-range begin (point) (current-buffer)) nil)) (t (XML-poly-parse-error "Cannot find the tag's end." begin (XML-poly-skip-token))))) (t (XML-poly-parse-error "Invalid tag." begin (XML-poly-skip-token)))))) ;;; element ;;; [ ] (defsubst XML-poly-create-element-obj (name attrs contents range options) (vector name attrs contents range options)) (defsubst XML-poly-get-element-name (obj) (aref obj 0)) (defsubst XML-poly-get-element-attribute-list (obj) (aref obj 1)) (defsubst XML-poly-get-element-contents (obj) (aref obj 2)) (defsubst XML-poly-set-element-contents (obj c) (aset obj 2 c)) (defsubst XML-poly-get-element-range (obj) (aref obj 3)) (defsubst XML-poly-get-element-options (obj) (aref obj 4)) (defun XML-poly-search-etag (prefix name end) (let ((regexp (if prefix (concat "\\)") (concat "\\)"))) (depth 1) e1 e2) (while (and (re-search-forward regexp end t) (setq e1 (match-beginning 0) e2 (match-end 0)) (if (eq (char-after (1+ e1)) ?/) (setq depth (1- depth)) (setq depth (1+ depth))) (> depth 0))) (if (= depth 0) (cons e1 e2)))) ;;; ;;; primitive/contents ;;; (defun XML-poly-parse-primitive (h) (let ((prev (point)) (begin (XML-poly-next-token)) r) (XML-poly-handle-set-obj h (if begin (cond ((looking-at "<\\?") (cond ((looking-at XML-poly-decl-head-regexp) (XML-poly-parse-decl)) (t (XML-poly-parse-PI)))) ((looking-at "