1888 lines
83 KiB
Racket
1888 lines
83 KiB
Racket
|
#lang racket/base
|
||
|
;; Copyright Neil Van Dyke. For legal info, see file "info.rkt".
|
||
|
|
||
|
(require mcfly)
|
||
|
|
||
|
(module+ test
|
||
|
(require overeasy))
|
||
|
|
||
|
(doc (section "Introduction")
|
||
|
|
||
|
(para "The "
|
||
|
(code "html-parsing")
|
||
|
" library provides a permissive HTML parser. The parser is useful
|
||
|
for software agent extraction of information from Web pages, for
|
||
|
programmatically transforming HTML files, and for implementing interactive Web
|
||
|
browsers. "
|
||
|
(code "html-parsing")
|
||
|
" emits "
|
||
|
;; TODO: 2016-02-21 Once create sxml-doc package, reference that.
|
||
|
(seclink "top"
|
||
|
#:doc '(lib "sxml-intro/sxml-intro.scrbl")
|
||
|
#:indirect? #true
|
||
|
"SXML/xexp")
|
||
|
", so that conventional HTML may be processed with XML tools such as
|
||
|
SXPath. Like Oleg Kiselyov's "
|
||
|
(hyperlink "http://pobox.com/~oleg/ftp/Scheme/xml.html#HTML-parser"
|
||
|
"SSAX-based HTML parser")
|
||
|
", "
|
||
|
(code "html-parsing")
|
||
|
" provides a permissive tokenizer, but "
|
||
|
(code "html-parsing")
|
||
|
" extends this by attempting to recover syntactic structure.")
|
||
|
|
||
|
(para "The "
|
||
|
(code "html-parsing")
|
||
|
" parsing behavior is permissive in that it accepts erroneous HTML,
|
||
|
handling several classes of HTML syntax errors gracefully, without yielding a
|
||
|
parse error. This is crucial for parsing arbitrary real-world Web pages, since
|
||
|
many pages actually contain syntax errors that would defeat a strict or
|
||
|
validating parser. "
|
||
|
(code "html-parsing")
|
||
|
"'s handling of errors is intended to generally emulate popular Web
|
||
|
browsers' interpretation of the structure of erroneous HTML.")
|
||
|
(para (code "html-parsing")
|
||
|
" also has some support for XHTML, although XML namespace qualifiers
|
||
|
are accepted but stripped from the resulting SXML/xexp. Note that "
|
||
|
(italic "valid")
|
||
|
" XHTML input might be better handled by a validating XML parser
|
||
|
like Kiselyov's SSAX."))
|
||
|
|
||
|
;; BEGIN COPIED FROM XEXP PACKAGE
|
||
|
|
||
|
(define (%html-parsing:make-xexp-char-ref val)
|
||
|
(if (or (symbol? val) (integer? val))
|
||
|
`(& ,val)
|
||
|
(error 'make-xexp-char-ref
|
||
|
"invalid xexp reference value: ~S"
|
||
|
val)))
|
||
|
|
||
|
(define %html-parsing:always-empty-html-elements
|
||
|
'(area base br frame hr img input isindex keygen link meta param
|
||
|
spacer wbr))
|
||
|
|
||
|
;; END COPIED FROM XEXP PACKAGE
|
||
|
|
||
|
(define %html-parsing:empty-token-symbol '*empty*)
|
||
|
(define %html-parsing:end-token-symbol '*end*)
|
||
|
(define %html-parsing:start-token-symbol '*start*)
|
||
|
(define %html-parsing:entity-token-symbol '*entity*)
|
||
|
(define %html-parsing:text-string-token-symbol '*text-string*)
|
||
|
(define %html-parsing:text-char-token-symbol '*text-char*)
|
||
|
|
||
|
(define %html-parsing:make-html-tokenizer
|
||
|
;; TODO: Have the tokenizer replace contiguous whitespace within individual
|
||
|
;; text tokens with single space characters (except for when in `pre' and
|
||
|
;; verbatim elements). The parser will introduce new contiguous whitespace
|
||
|
;; (e.g., when text tokens are concatenated, invalid end tags are removed,
|
||
|
;; whitespace is irrelevant between certain elements), but then the parser
|
||
|
;; only has to worry about the first and last character of each string.
|
||
|
;; Perhaps the text tokens should have both leading and trailing whitespace
|
||
|
;; stripped, and contain flags for whether or not leading and trailing
|
||
|
;; whitespace occurred.
|
||
|
(letrec ((no-token '())
|
||
|
|
||
|
;; TODO: Maybe make these three variables options.
|
||
|
|
||
|
(verbatim-to-eof-elems '(plaintext))
|
||
|
|
||
|
(verbatim-pair-elems '(script server style xmp))
|
||
|
|
||
|
(ws-chars (list #\space
|
||
|
(integer->char 9)
|
||
|
(integer->char 10)
|
||
|
(integer->char 11)
|
||
|
(integer->char 12)
|
||
|
(integer->char 13)))
|
||
|
|
||
|
(gosc/string-or-false
|
||
|
(lambda (os)
|
||
|
(let ((s (get-output-string os)))
|
||
|
(if (string=? s "") #f s))))
|
||
|
|
||
|
(gosc/symbol-or-false
|
||
|
(lambda (os)
|
||
|
(let ((s (gosc/string-or-false os)))
|
||
|
(if s (string->symbol s) #f))))
|
||
|
)
|
||
|
(lambda (in normalized?)
|
||
|
;; TODO: Make a tokenizer option that causes XML namespace qualifiers to
|
||
|
;; be ignored.
|
||
|
(letrec
|
||
|
(
|
||
|
;; Port buffer with inexpensive unread of one character and slightly
|
||
|
;; more expensive pushback of second character to unread. The
|
||
|
;; procedures themselves do no consing. The tokenizer currently
|
||
|
;; needs two-symbol lookahead, due to ambiguous "/" while parsing
|
||
|
;; element and attribute names, which could be either empty-tag
|
||
|
;; syntax or XML qualified names.
|
||
|
(c #f)
|
||
|
(next-c #f)
|
||
|
(c-consumed? #t)
|
||
|
(read-c (lambda ()
|
||
|
(if c-consumed?
|
||
|
(if next-c
|
||
|
(begin (set! c next-c)
|
||
|
(set! next-c #f))
|
||
|
(set! c (read-char in)))
|
||
|
(set! c-consumed? #t))))
|
||
|
(unread-c (lambda ()
|
||
|
(if c-consumed?
|
||
|
(set! c-consumed? #f)
|
||
|
;; TODO: Procedure name in error message really
|
||
|
;; isn't "%html-parsing:make-html-tokenizer"...
|
||
|
(error '%html-parsing:make-html-tokenizer
|
||
|
"already unread: ~S"
|
||
|
c))))
|
||
|
(push-c (lambda (new-c)
|
||
|
(if c-consumed?
|
||
|
(begin (set! c new-c)
|
||
|
(set! c-consumed? #f))
|
||
|
(if next-c
|
||
|
(error '%html-parsing:make-html-tokenizer
|
||
|
"pushback full: ~S"
|
||
|
c)
|
||
|
(begin (set! next-c c)
|
||
|
(set! c new-c)
|
||
|
(set! c-consumed? #f))))))
|
||
|
|
||
|
;; TODO: These procedures are a temporary convenience for
|
||
|
;; enumerating the pertinent character classes, with an eye towards
|
||
|
;; removing redundant tests of character class. These procedures
|
||
|
;; should be eliminated in a future version.
|
||
|
(c-eof? (lambda () (eof-object? c)))
|
||
|
(c-amp? (lambda () (eqv? c #\&)))
|
||
|
(c-apos? (lambda () (eqv? c #\')))
|
||
|
(c-bang? (lambda () (eqv? c #\!)))
|
||
|
(c-colon? (lambda () (eqv? c #\:)))
|
||
|
(c-quot? (lambda () (eqv? c #\")))
|
||
|
(c-equals? (lambda () (eqv? c #\=)))
|
||
|
(c-gt? (lambda () (eqv? c #\>)))
|
||
|
(c-lsquare? (lambda () (eqv? c #\[)))
|
||
|
(c-lt? (lambda () (eqv? c #\<)))
|
||
|
(c-minus? (lambda () (eqv? c #\-)))
|
||
|
(c-pound? (lambda () (eqv? c #\#)))
|
||
|
(c-ques? (lambda () (eqv? c #\?)))
|
||
|
(c-semi? (lambda () (eqv? c #\;)))
|
||
|
(c-slash? (lambda () (eqv? c #\/)))
|
||
|
(c-splat? (lambda () (eqv? c #\*)))
|
||
|
(c-lf? (lambda () (eqv? c #\newline)))
|
||
|
(c-angle? (lambda () (memv c '(#\< #\>))))
|
||
|
(c-ws? (lambda () (memv c ws-chars)))
|
||
|
(c-alpha? (lambda () (char-alphabetic? c)))
|
||
|
(c-digit? (lambda () (char-numeric? c)))
|
||
|
(c-alphanum? (lambda () (or (c-alpha?) (c-digit?))))
|
||
|
(c-hexlet? (lambda () (memv c '(#\a #\b #\c #\d #\e #\f
|
||
|
#\A #\B #\C #\D #\E #\F))))
|
||
|
|
||
|
(skip-ws (lambda () (read-c) (if (c-ws?) (skip-ws) (unread-c))))
|
||
|
|
||
|
(if-read-chars
|
||
|
(lambda (match-chars yes-thunk no-proc)
|
||
|
(let loop ((chars match-chars)
|
||
|
(match-count 0))
|
||
|
(if (null? chars)
|
||
|
(yes-thunk)
|
||
|
(begin (read-c)
|
||
|
(if (eqv? c (car chars))
|
||
|
(begin (loop (cdr chars) (+ 1 match-count)))
|
||
|
(begin (unread-c)
|
||
|
(no-proc match-chars match-count))))))))
|
||
|
|
||
|
(write-chars-count
|
||
|
(lambda (chars count port)
|
||
|
(let loop ((chars chars)
|
||
|
(count count))
|
||
|
(or (zero? count)
|
||
|
(begin (write-char (car chars) port)
|
||
|
(loop (cdr chars)
|
||
|
(- count 1)))))))
|
||
|
|
||
|
(make-start-token
|
||
|
(if normalized?
|
||
|
(lambda (name ns attrs)
|
||
|
(list name (cons '@ attrs)))
|
||
|
(lambda (name ns attrs)
|
||
|
(if (null? attrs)
|
||
|
(list name)
|
||
|
(list name (cons '@ attrs))))))
|
||
|
|
||
|
(make-empty-token
|
||
|
(lambda (name ns attrs)
|
||
|
(cons %html-parsing:empty-token-symbol
|
||
|
(make-start-token name ns attrs))))
|
||
|
|
||
|
(make-end-token
|
||
|
(if normalized?
|
||
|
(lambda (name ns attrs)
|
||
|
(list %html-parsing:end-token-symbol
|
||
|
name
|
||
|
(cons '@ attrs)))
|
||
|
(lambda (name ns attrs)
|
||
|
(if (null? attrs)
|
||
|
(list %html-parsing:end-token-symbol name)
|
||
|
(list %html-parsing:end-token-symbol
|
||
|
name
|
||
|
(cons '@ attrs))))))
|
||
|
|
||
|
(make-comment-token
|
||
|
(lambda (str) (list '*COMMENT* str)))
|
||
|
|
||
|
(make-decl-token
|
||
|
(lambda (parts) (cons '*DECL* parts)))
|
||
|
|
||
|
(scan-qname
|
||
|
;; TODO: Make sure we don't accept local names that have "*", since
|
||
|
;; this can break SXML tools. Have to validate this afterwards if
|
||
|
;; "verbatim-safe?". Also check for "@" and maybe "@@". Check
|
||
|
;; qname parsing code, especially for verbatim mode. This is
|
||
|
;; important!
|
||
|
(lambda (verbatim-safe?)
|
||
|
;; Note: If we accept some invalid local names, we only need two
|
||
|
;; symbols of lookahead to determine the end of a qname.
|
||
|
(letrec ((os #f)
|
||
|
(ns '())
|
||
|
(vcolons 0)
|
||
|
(good-os (lambda ()
|
||
|
(or os
|
||
|
(begin (set! os (open-output-string))
|
||
|
os)))))
|
||
|
(let loop ()
|
||
|
(read-c)
|
||
|
(cond ((c-eof?) #f)
|
||
|
((or (c-ws?) (c-splat?))
|
||
|
(if verbatim-safe?
|
||
|
(unread-c)
|
||
|
#f))
|
||
|
((or (c-angle?) (c-equals?) (c-quot?) (c-apos?))
|
||
|
(unread-c))
|
||
|
((c-colon?)
|
||
|
(or (null? ns)
|
||
|
(set! ns (cons ":" ns)))
|
||
|
(if os
|
||
|
(begin
|
||
|
(set! ns (cons (get-output-string os)
|
||
|
ns))
|
||
|
(set! os #f))
|
||
|
#f)
|
||
|
(loop))
|
||
|
((c-slash?)
|
||
|
(read-c)
|
||
|
(cond ((or (c-eof?)
|
||
|
(c-ws?)
|
||
|
(c-equals?)
|
||
|
(c-apos?)
|
||
|
(c-quot?)
|
||
|
(c-angle?)
|
||
|
(c-splat?))
|
||
|
(unread-c)
|
||
|
(push-c #\/))
|
||
|
(else (write-char #\/ (good-os))
|
||
|
(write-char c os)
|
||
|
(loop))))
|
||
|
(else (write-char c (good-os))
|
||
|
(loop))))
|
||
|
(let ((ns (if (null? ns)
|
||
|
#f
|
||
|
(apply string-append
|
||
|
(reverse ns))))
|
||
|
(localname (if os (get-output-string os) #f)))
|
||
|
(if verbatim-safe?
|
||
|
;; TODO: Make sure we don't have ambiguous ":" or drop
|
||
|
;; any characters!
|
||
|
(cons ns localname)
|
||
|
;; Note: We represent "xml:" and "xmlns:" syntax as
|
||
|
;; normal qnames, for lack of something better to do with
|
||
|
;; them when we don't support XML namespaces.
|
||
|
;;
|
||
|
;; TODO: Local names are currently forced to lowercase,
|
||
|
;; since HTML is usually case-insensitive. If XML
|
||
|
;; namespaces are used, we might wish to keep local names
|
||
|
;; case-sensitive.
|
||
|
(if localname
|
||
|
(if ns
|
||
|
(if (or (string=? ns "xml")
|
||
|
(string=? ns "xmlns"))
|
||
|
(string->symbol (string-append ns
|
||
|
":"
|
||
|
localname))
|
||
|
(cons ns
|
||
|
(string->symbol (string-downcase
|
||
|
localname))))
|
||
|
(string->symbol (string-downcase localname)))
|
||
|
(if ns
|
||
|
(string->symbol (string-downcase ns))
|
||
|
;; TODO: Ensure in rest of code that returning #f
|
||
|
;; as a name here is OK.
|
||
|
#f)))))))
|
||
|
|
||
|
(scan-tag
|
||
|
(lambda (start?)
|
||
|
(skip-ws)
|
||
|
(let ((tag-name (scan-qname #f))
|
||
|
(tag-ns #f)
|
||
|
(tag-attrs #f)
|
||
|
(tag-empty? #f))
|
||
|
;; Scan element name.
|
||
|
(if (pair? tag-name)
|
||
|
(begin (set! tag-ns (car tag-name))
|
||
|
(set! tag-name (cdr tag-name)))
|
||
|
#f)
|
||
|
;; TODO: Ensure there's no case in which a #f tag-name isn't
|
||
|
;; compensated for later.
|
||
|
;;
|
||
|
;; Scan element attributes.
|
||
|
(set! tag-attrs
|
||
|
(let scan-attr-list ()
|
||
|
(read-c)
|
||
|
(cond ((c-eof?) '())
|
||
|
((c-angle?) (unread-c) '())
|
||
|
((c-slash?)
|
||
|
(set! tag-empty? #t)
|
||
|
(scan-attr-list))
|
||
|
((c-alpha?)
|
||
|
(unread-c)
|
||
|
(let ((attr (scan-attr)))
|
||
|
(cons attr (scan-attr-list))))
|
||
|
(else (scan-attr-list)))))
|
||
|
;; Find ">" or unnatural end.
|
||
|
(let loop ()
|
||
|
(read-c)
|
||
|
(cond ((c-eof?) no-token)
|
||
|
((c-slash?) (set! tag-empty? #t) (loop))
|
||
|
((c-gt?) #f)
|
||
|
((c-ws?) (loop))
|
||
|
(else (unread-c))))
|
||
|
;; Change the tokenizer mode if necessary.
|
||
|
(cond ((not start?) #f)
|
||
|
(tag-empty? #f)
|
||
|
;; TODO: Maybe make one alist lookup here, instead of
|
||
|
;; two.
|
||
|
((memq tag-name verbatim-to-eof-elems)
|
||
|
(set! nexttok verbeof-nexttok))
|
||
|
((memq tag-name verbatim-pair-elems)
|
||
|
(set! nexttok (make-verbpair-nexttok tag-name))))
|
||
|
;; Return a token object.
|
||
|
(if start?
|
||
|
(if tag-empty?
|
||
|
(make-empty-token tag-name tag-ns tag-attrs)
|
||
|
(make-start-token tag-name tag-ns tag-attrs))
|
||
|
(make-end-token tag-name tag-ns tag-attrs)))))
|
||
|
|
||
|
(scan-attr
|
||
|
(lambda ()
|
||
|
(let ((name (scan-qname #f))
|
||
|
(val #f))
|
||
|
(if (pair? name)
|
||
|
(set! name (cdr name))
|
||
|
#f)
|
||
|
(let loop-equals-or-end ()
|
||
|
(read-c)
|
||
|
(cond ((c-eof?) no-token)
|
||
|
((c-ws?) (loop-equals-or-end))
|
||
|
((c-equals?)
|
||
|
(let loop-quote-or-unquoted ()
|
||
|
(read-c)
|
||
|
(cond ((c-eof?) no-token)
|
||
|
((c-ws?) (loop-quote-or-unquoted))
|
||
|
((or (c-apos?) (c-quot?))
|
||
|
(let ((term c))
|
||
|
(set! val (open-output-string))
|
||
|
(let loop-quoted-val ()
|
||
|
(read-c)
|
||
|
(cond ((c-eof?) #f)
|
||
|
((eqv? c term) #f)
|
||
|
((c-amp?) (let ((entity (scan-entity)))
|
||
|
(display entity val)
|
||
|
(loop-quoted-val)))
|
||
|
(else (write-char c val)
|
||
|
(loop-quoted-val))))))
|
||
|
((c-angle?) (unread-c))
|
||
|
(else
|
||
|
(set! val (open-output-string))
|
||
|
(write-char c val)
|
||
|
(let loop-unquoted-val ()
|
||
|
(read-c)
|
||
|
(cond ((c-eof?) no-token)
|
||
|
((c-apos?) #f)
|
||
|
((c-quot?) #f)
|
||
|
((or (c-ws?) (c-angle?)
|
||
|
;;(c-slash?)
|
||
|
)
|
||
|
(unread-c))
|
||
|
;; Note: We can treat a slash in an
|
||
|
;; unquoted attribute value as a
|
||
|
;; value constituent because the
|
||
|
;; slash is specially-handled only
|
||
|
;; for XHTML, and XHTML attribute
|
||
|
;; values must always be quoted. We
|
||
|
;; could do lookahead for "/>", but
|
||
|
;; that wouldn't let us parse HTML
|
||
|
;; "<a href=/>" correctly, so this is
|
||
|
;; an easier and more correct way to
|
||
|
;; do things.
|
||
|
(else (write-char c val)
|
||
|
(loop-unquoted-val))))))))
|
||
|
(else (unread-c))))
|
||
|
(if normalized?
|
||
|
(list name (if val
|
||
|
(get-output-string val)
|
||
|
(symbol->string name)))
|
||
|
(if val
|
||
|
(list name (get-output-string val))
|
||
|
(list name))))))
|
||
|
|
||
|
(scan-comment
|
||
|
;; TODO: Rewrite this to use tail recursion rather than a state
|
||
|
;; variable.
|
||
|
(lambda ()
|
||
|
(let ((os (open-output-string))
|
||
|
(state 'start-minus))
|
||
|
(let loop ()
|
||
|
(read-c)
|
||
|
(cond ((c-eof?) #f)
|
||
|
((c-minus?)
|
||
|
(set! state
|
||
|
(case state
|
||
|
((start-minus) 'start-minus-minus)
|
||
|
((start-minus-minus body) 'end-minus)
|
||
|
((end-minus) 'end-minus-minus)
|
||
|
((end-minus-minus) (write-char #\- os) state)
|
||
|
(else (error '<%html-parsing:make-html-tokenizer>
|
||
|
"invalid state: ~S"
|
||
|
state))))
|
||
|
(loop))
|
||
|
((and (c-gt?) (eq? state 'end-minus-minus)) #f)
|
||
|
(else (case state
|
||
|
((end-minus) (write-char #\- os))
|
||
|
((end-minus-minus) (display "--" os)))
|
||
|
(set! state 'body)
|
||
|
(write-char c os)
|
||
|
(loop))))
|
||
|
(make-comment-token (get-output-string os)))))
|
||
|
|
||
|
(scan-possible-cdata
|
||
|
(lambda ()
|
||
|
;; Read "<!" and current character is "[", so try to read the
|
||
|
;; rest of the CDATA start delimeter.
|
||
|
(if-read-chars
|
||
|
'(#\C #\D #\A #\T #\A #\[)
|
||
|
(lambda ()
|
||
|
;; Successfully read CDATA section start delimiter, so read
|
||
|
;; the section.
|
||
|
(scan-cdata))
|
||
|
(lambda (chars count)
|
||
|
;; Did not read rest of CDATA section start delimiter, so
|
||
|
;; return a string for what we did read.
|
||
|
(let ((os (open-output-string)))
|
||
|
(display "<![" os)
|
||
|
(write-chars-count chars count os)
|
||
|
(get-output-string os))))))
|
||
|
|
||
|
(scan-cdata
|
||
|
(lambda ()
|
||
|
(let ((os (open-output-string)))
|
||
|
(let loop ()
|
||
|
(if-read-chars
|
||
|
'(#\] #\] #\>)
|
||
|
(lambda () (get-output-string os))
|
||
|
(lambda (chars count)
|
||
|
(if (zero? count)
|
||
|
(if (eof-object? c)
|
||
|
(get-output-string os)
|
||
|
(begin (write-char c os)
|
||
|
(read-c)
|
||
|
(loop)))
|
||
|
(begin (write-char #\] os)
|
||
|
(if (= count 2)
|
||
|
(push-c #\])
|
||
|
#f)
|
||
|
(loop)))))))))
|
||
|
|
||
|
(scan-pi
|
||
|
(lambda ()
|
||
|
(skip-ws)
|
||
|
(let ((name (open-output-string))
|
||
|
(val (open-output-string)))
|
||
|
(let scan-name ()
|
||
|
(read-c)
|
||
|
(cond ((c-eof?) #f)
|
||
|
((c-ws?) #f)
|
||
|
((c-alpha?) (write-char c name) (scan-name))
|
||
|
(else (unread-c))))
|
||
|
;; TODO: Do we really want to emit #f for PI name?
|
||
|
(set! name (gosc/symbol-or-false name))
|
||
|
(let scan-val ()
|
||
|
(read-c)
|
||
|
(cond ((c-eof?) #f)
|
||
|
;; ((c-amp?) (display (scan-entity) val)
|
||
|
;; (scan-val))
|
||
|
((c-ques?)
|
||
|
(read-c)
|
||
|
(cond ((c-eof?) (write-char #\? val))
|
||
|
((c-gt?) #f)
|
||
|
(else (write-char #\? val)
|
||
|
(unread-c)
|
||
|
(scan-val))))
|
||
|
(else (write-char c val) (scan-val))))
|
||
|
(list '*PI*
|
||
|
name
|
||
|
(get-output-string val)))))
|
||
|
|
||
|
(scan-decl
|
||
|
;; TODO: Find if SXML includes declaration forms, and if so, use
|
||
|
;; whatever format SXML wants.
|
||
|
;;
|
||
|
;; TODO: Rewrite to eliminate state variables.
|
||
|
(letrec
|
||
|
((scan-parts
|
||
|
(lambda ()
|
||
|
(let ((part (open-output-string))
|
||
|
(nonsymbol? #f)
|
||
|
(state 'before)
|
||
|
(last? #f))
|
||
|
(let loop ()
|
||
|
(read-c)
|
||
|
(cond ((c-eof?) #f)
|
||
|
((c-ws?)
|
||
|
(case state
|
||
|
((before) (loop))
|
||
|
((quoted) (write-char c part) (loop))))
|
||
|
((and (c-gt?) (not (eq? state 'quoted)))
|
||
|
(set! last? #t))
|
||
|
((and (c-lt?) (not (eq? state 'quoted)))
|
||
|
(unread-c))
|
||
|
((c-quot?)
|
||
|
(case state
|
||
|
((before) (set! state 'quoted) (loop))
|
||
|
((unquoted) (unread-c))
|
||
|
((quoted) #f)))
|
||
|
(else
|
||
|
(if (eq? state 'before)
|
||
|
(set! state 'unquoted)
|
||
|
#f)
|
||
|
(set! nonsymbol? (or nonsymbol?
|
||
|
(not (c-alphanum?))))
|
||
|
(write-char c part)
|
||
|
(loop))))
|
||
|
(set! part (get-output-string part))
|
||
|
(if (string=? part "")
|
||
|
'()
|
||
|
(cons (if (or (eq? state 'quoted) nonsymbol?)
|
||
|
part
|
||
|
;; TODO: Normalize case of things we make
|
||
|
;; into symbols here.
|
||
|
(string->symbol part))
|
||
|
(if last?
|
||
|
'()
|
||
|
(scan-parts))))))))
|
||
|
(lambda () (make-decl-token (scan-parts)))))
|
||
|
|
||
|
(special-entity-reverse-chars-to-string-alist
|
||
|
'(((#\p #\m #\a) . "&")
|
||
|
((#\s #\o #\p #\a) . "'")
|
||
|
((#\t #\g) . ">")
|
||
|
((#\t #\l) . "<")
|
||
|
((#\t #\o #\u #\q) . "\"")))
|
||
|
|
||
|
(finish-terminated-named-entity
|
||
|
(lambda (reverse-name-chars)
|
||
|
(cond ((equal? '() reverse-name-chars)
|
||
|
"&")
|
||
|
((assoc reverse-name-chars
|
||
|
special-entity-reverse-chars-to-string-alist)
|
||
|
=> (lambda (p)
|
||
|
(cdr p)))
|
||
|
(else (%html-parsing:make-xexp-char-ref
|
||
|
(string->symbol (apply string (reverse reverse-name-chars))))))))
|
||
|
|
||
|
(finish-unterminated-named-entity
|
||
|
(lambda (reverse-name-chars)
|
||
|
(apply string (cons #\& (reverse reverse-name-chars)))))
|
||
|
|
||
|
(scan-entity
|
||
|
(lambda ()
|
||
|
(read-c)
|
||
|
(cond ((c-eof?) "&")
|
||
|
((c-alpha?)
|
||
|
;; TODO: Do entity names have a maximum length?
|
||
|
(let loop ((reverse-name-chars (cons c '())))
|
||
|
(read-c)
|
||
|
(cond ((c-eof?) (finish-unterminated-named-entity
|
||
|
reverse-name-chars))
|
||
|
((c-alpha?) (let ((reverse-name-chars (cons c reverse-name-chars)))
|
||
|
(cond ((assoc reverse-name-chars
|
||
|
special-entity-reverse-chars-to-string-alist)
|
||
|
=> (lambda (p)
|
||
|
(read-c)
|
||
|
(or (c-semi?)
|
||
|
(unread-c))
|
||
|
(cdr p)))
|
||
|
(else (loop reverse-name-chars)))))
|
||
|
((c-semi?) (finish-terminated-named-entity
|
||
|
reverse-name-chars))
|
||
|
(else (unread-c)
|
||
|
(finish-unterminated-named-entity
|
||
|
reverse-name-chars)))))
|
||
|
((c-pound?)
|
||
|
(let ((num (open-output-string))
|
||
|
(hex? #f))
|
||
|
(read-c)
|
||
|
(cond ((c-eof?) #f)
|
||
|
((memv c '(#\x #\X)) (set! hex? #t) (read-c)))
|
||
|
(let loop ()
|
||
|
(cond ((c-eof?) #f)
|
||
|
((c-semi?) #f)
|
||
|
((or (c-digit?) (and hex? (c-hexlet?)))
|
||
|
(write-char c num)
|
||
|
(read-c)
|
||
|
(loop))
|
||
|
(else (unread-c))))
|
||
|
(set! num (get-output-string num))
|
||
|
(if (string=? num "")
|
||
|
"&#;"
|
||
|
(let ((n (string->number num (if hex? 16 10))))
|
||
|
(if (<= 32 n 126)
|
||
|
(string (integer->char n))
|
||
|
(string (integer->char n)))))))
|
||
|
(else (unread-c) "&"))))
|
||
|
|
||
|
(normal-nexttok
|
||
|
(lambda ()
|
||
|
(read-c)
|
||
|
(cond ((c-eof?) no-token)
|
||
|
((c-lt?)
|
||
|
(let loop ()
|
||
|
(read-c)
|
||
|
(cond ((c-eof?) "<")
|
||
|
;; ((c-ws?) (loop))
|
||
|
((c-slash?) (scan-tag #f))
|
||
|
((c-ques?) (scan-pi))
|
||
|
((c-alpha?) (unread-c) (scan-tag #t))
|
||
|
((c-bang?)
|
||
|
(read-c)
|
||
|
(if (c-lsquare?)
|
||
|
(scan-possible-cdata)
|
||
|
(let loop ()
|
||
|
(cond ((c-eof?) no-token)
|
||
|
((c-ws?) (read-c) (loop))
|
||
|
((c-minus?) (scan-comment))
|
||
|
(else (unread-c)
|
||
|
(scan-decl))))))
|
||
|
(else (unread-c) "<"))))
|
||
|
((c-gt?) ">")
|
||
|
(else (let ((os (open-output-string)))
|
||
|
(let loop ()
|
||
|
(cond ((c-eof?) #f)
|
||
|
((c-angle?) (unread-c))
|
||
|
((c-amp?)
|
||
|
(let ((entity (scan-entity)))
|
||
|
(if (string? entity)
|
||
|
(begin (display entity os)
|
||
|
(read-c)
|
||
|
(loop))
|
||
|
(let ((saved-nexttok nexttok))
|
||
|
(set! nexttok
|
||
|
(lambda ()
|
||
|
(set! nexttok
|
||
|
saved-nexttok)
|
||
|
entity))))))
|
||
|
(else (write-char c os)
|
||
|
(or (c-lf?)
|
||
|
(begin (read-c) (loop))))))
|
||
|
(let ((text (get-output-string os)))
|
||
|
(if (equal? text "")
|
||
|
(nexttok)
|
||
|
text)))))))
|
||
|
|
||
|
(verbeof-nexttok
|
||
|
(lambda ()
|
||
|
(read-c)
|
||
|
(if (c-eof?)
|
||
|
no-token
|
||
|
(let ((os (open-output-string)))
|
||
|
(let loop ()
|
||
|
(or (c-eof?)
|
||
|
(begin (write-char c os)
|
||
|
(or (c-lf?)
|
||
|
(begin (read-c) (loop))))))
|
||
|
(get-output-string os)))))
|
||
|
|
||
|
(make-verbpair-nexttok
|
||
|
(lambda (elem-name)
|
||
|
(lambda ()
|
||
|
(let ((os (open-output-string)))
|
||
|
;; Accumulate up to a newline-terminated line.
|
||
|
(let loop ()
|
||
|
(read-c)
|
||
|
(cond ((c-eof?)
|
||
|
;; Got EOF in verbatim context, so set the normal
|
||
|
;; nextok procedure, then fall out of loop.
|
||
|
(set! nexttok normal-nexttok))
|
||
|
((c-lt?)
|
||
|
;; Got "<" in verbatim context, so get next
|
||
|
;; character.
|
||
|
(read-c)
|
||
|
(cond ((c-eof?)
|
||
|
;; Got "<" then EOF, so set to the normal
|
||
|
;; nexttok procedure, add the "<" to the
|
||
|
;; verbatim string, and fall out of loop.
|
||
|
(set! nexttok normal-nexttok)
|
||
|
(write-char #\< os))
|
||
|
((c-slash?)
|
||
|
;; Got "</", so...
|
||
|
(read-c)
|
||
|
(cond
|
||
|
((c-eof?)
|
||
|
(display "</" os))
|
||
|
((c-alpha?)
|
||
|
;; Got "</" followed by alpha, so unread
|
||
|
;; the alpha, scan qname, compare...
|
||
|
(unread-c)
|
||
|
(let* ((vqname (scan-qname #t))
|
||
|
(ns (car vqname))
|
||
|
(local (cdr vqname)))
|
||
|
;; Note: We ignore XML namespace
|
||
|
;; qualifier for purposes of comparison.
|
||
|
;;
|
||
|
;; Note: We're interning strings here for
|
||
|
;; comparison when in theory there could
|
||
|
;; be many such unique interned strings
|
||
|
;; in a valid HTML document, although in
|
||
|
;; practice this should not be a problem.
|
||
|
(if (and local
|
||
|
(eqv? (string->symbol
|
||
|
(string-downcase local))
|
||
|
elem-name))
|
||
|
;; This is the terminator tag, so
|
||
|
;; scan to the end of it, set the
|
||
|
;; nexttok, and fall out of the loop.
|
||
|
(begin
|
||
|
(let scan-to-end ()
|
||
|
(read-c)
|
||
|
(cond ((c-eof?) #f)
|
||
|
((c-gt?) #f)
|
||
|
((c-lt?) (unread-c))
|
||
|
((c-alpha?)
|
||
|
(unread-c)
|
||
|
;; Note: This is an
|
||
|
;; expensive way to skip
|
||
|
;; over an attribute, but
|
||
|
;; in practice more
|
||
|
;; verbatim end tags will
|
||
|
;; not have attributes.
|
||
|
(scan-attr)
|
||
|
(scan-to-end))
|
||
|
(else (scan-to-end))))
|
||
|
(set! nexttok
|
||
|
(lambda ()
|
||
|
(set! nexttok
|
||
|
normal-nexttok)
|
||
|
(make-end-token
|
||
|
elem-name #f '()))))
|
||
|
;; This isn't the terminator tag, so
|
||
|
;; add to the verbatim string the
|
||
|
;; "</" and the characters of what we
|
||
|
;; were scanning as a qname, and
|
||
|
;; recurse in the loop.
|
||
|
(begin
|
||
|
(display "</" os)
|
||
|
(if ns
|
||
|
(begin (display ns os)
|
||
|
(display ":" os))
|
||
|
#f)
|
||
|
(if local
|
||
|
(display local os)
|
||
|
#f)
|
||
|
(loop)))))
|
||
|
(else
|
||
|
;; Got "</" and non-alpha, so unread new
|
||
|
;; character, add the "</" to verbatim
|
||
|
;; string, then loop.
|
||
|
(unread-c)
|
||
|
(display "</" os)
|
||
|
(loop))))
|
||
|
(else
|
||
|
;; Got "<" and non-slash, so unread the new
|
||
|
;; character, write the "<" to the verbatim
|
||
|
;; string, then loop.
|
||
|
(unread-c)
|
||
|
(write-char #\< os)
|
||
|
(loop))))
|
||
|
(else
|
||
|
;; Got non-"<" in verbatim context, so just add it
|
||
|
;; to the buffer, then, if it's not a linefeed, fall
|
||
|
;; out of the loop so that the token can be
|
||
|
;; returned.
|
||
|
(write-char c os)
|
||
|
(or (c-lf?) (loop)))))
|
||
|
;; Return the accumulated line string, if non-null, or call
|
||
|
;; nexttok.
|
||
|
(or (gosc/string-or-false os) (nexttok))))))
|
||
|
|
||
|
(nexttok #f))
|
||
|
|
||
|
(set! nexttok normal-nexttok)
|
||
|
(lambda () (nexttok))))))
|
||
|
|
||
|
(define (%html-parsing:tokenize-html in normalized?)
|
||
|
(let ((next-tok (%html-parsing:make-html-tokenizer in normalized?)))
|
||
|
(let loop ((tok (next-tok)))
|
||
|
(if (null? tok)
|
||
|
'()
|
||
|
(cons tok (loop (next-tok)))))))
|
||
|
|
||
|
(define (%html-parsing:xexp-token-kind token)
|
||
|
(cond ((string? token) %html-parsing:text-string-token-symbol)
|
||
|
((char? token) %html-parsing:text-char-token-symbol)
|
||
|
((list? token)
|
||
|
(let ((s (car token)))
|
||
|
(if (memq s `(*COMMENT*
|
||
|
*DECL*
|
||
|
*PI*
|
||
|
,%html-parsing:empty-token-symbol
|
||
|
,%html-parsing:end-token-symbol
|
||
|
,%html-parsing:entity-token-symbol))
|
||
|
s
|
||
|
%html-parsing:start-token-symbol)))
|
||
|
(else (error '%html-parsing:xexp-token-kind
|
||
|
"unrecognized token kind: ~S"
|
||
|
token))))
|
||
|
|
||
|
(doc (section "Interface"))
|
||
|
|
||
|
;; @defvar %html-parsing:empty-elements
|
||
|
;;
|
||
|
;; List of names of HTML element types that have no content, represented as a
|
||
|
;; list of symbols. This is used internally by the parser and encoder. The
|
||
|
;; effect of mutating this list is undefined.
|
||
|
|
||
|
;; TODO: Document exactly which elements these are, after we make the new
|
||
|
;; parameterized parser constructor.
|
||
|
|
||
|
(define %html-parsing:empty-elements
|
||
|
(cons '& %html-parsing:always-empty-html-elements))
|
||
|
|
||
|
;; @defproc parse-html/tokenizer tokenizer normalized?
|
||
|
;;
|
||
|
;; Emits a parse tree like @code{html->xexp} and related procedures, except
|
||
|
;; using @var{tokenizer} as a source of tokens, rather than tokenizing from an
|
||
|
;; input port. This procedure is used internally, and generally should not be
|
||
|
;; called directly.
|
||
|
|
||
|
(define %html-parsing:parse-html/tokenizer
|
||
|
;; Note: This algorithm was originally written in 2001 (as part of the first
|
||
|
;; Scheme library the author ever wrote), and then on 2009-08-16 was revamped
|
||
|
;; to not use mutable pairs, for PLT 4 compatibility. It could still use
|
||
|
;; some work to be more FP, but it works for now.
|
||
|
(letrec ((empty-elements
|
||
|
;; TODO: Maybe make this an option.
|
||
|
%html-parsing:empty-elements)
|
||
|
(h-elem-parents
|
||
|
;; Am doing this kludge mainly for mid-1990s HTML that uses the `p`
|
||
|
;; element wrong. Trying to get all appropriate parents other than
|
||
|
;; `p` that I can, to reduce breaking other code.
|
||
|
'(a article aside blink blockquote body center footer form header html li main nav section slot template))
|
||
|
(parent-constraints
|
||
|
;; TODO: Maybe make this an option.
|
||
|
`((area . (map span))
|
||
|
(body . (html))
|
||
|
(caption . (table))
|
||
|
(colgroup . (table))
|
||
|
(dd . (dl))
|
||
|
(dt . (dl))
|
||
|
(frame . (frameset))
|
||
|
(head . (html))
|
||
|
(h1 . ,h-elem-parents)
|
||
|
(h2 . ,h-elem-parents)
|
||
|
(h3 . ,h-elem-parents)
|
||
|
(h4 . ,h-elem-parents)
|
||
|
(h5 . ,h-elem-parents)
|
||
|
(h6 . ,h-elem-parents)
|
||
|
(isindex . (head))
|
||
|
(li . (dir menu ol ul))
|
||
|
(meta . (head))
|
||
|
(noframes . (frameset))
|
||
|
(option . (select))
|
||
|
(p . (blockquote body details figcaption html li td th))
|
||
|
(param . (applet))
|
||
|
(tbody . (table))
|
||
|
(td . (tr))
|
||
|
(th . (tr))
|
||
|
(thead . (table))
|
||
|
(title . (head))
|
||
|
(tr . (table tbody thead))))
|
||
|
(token-kinds-that-always-get-added
|
||
|
`(*COMMENT*
|
||
|
*DECL*
|
||
|
*PI*
|
||
|
,%html-parsing:entity-token-symbol
|
||
|
,%html-parsing:text-string-token-symbol
|
||
|
,%html-parsing:text-char-token-symbol))
|
||
|
(start-tag-name (lambda (tag-token) (car tag-token)))
|
||
|
(end-tag-name (lambda (tag-token) (list-ref tag-token 1))))
|
||
|
(lambda (tokenizer normalized?)
|
||
|
;;(log-html-parsing-debug "(%html-parsing:parse-html/tokenizer ~S ~S)" tokenizer normalized?)
|
||
|
(let ((begs (list (vector #f '()))))
|
||
|
(letrec ((add-thing-as-child-of-current-beg
|
||
|
(lambda (tok)
|
||
|
(let ((beg (car begs)))
|
||
|
(vector-set! beg 1 (cons tok (vector-ref beg 1))))))
|
||
|
|
||
|
(beg->elem
|
||
|
(lambda (beg)
|
||
|
(let ((elem-name (vector-ref beg 0))
|
||
|
(attrs-and-contents (reverse (vector-ref beg 1))))
|
||
|
(cons elem-name attrs-and-contents))))
|
||
|
|
||
|
(finish-current-beg-and-return-elem
|
||
|
(lambda ()
|
||
|
(let ((elem (beg->elem (car begs))))
|
||
|
(set! begs (cdr begs))
|
||
|
(or (null? begs)
|
||
|
(add-thing-as-child-of-current-beg elem))
|
||
|
elem)))
|
||
|
|
||
|
(finish-current-beg
|
||
|
(lambda ()
|
||
|
(finish-current-beg-and-return-elem)))
|
||
|
|
||
|
(finish-all-begs-and-return-top
|
||
|
(lambda ()
|
||
|
(let loop ()
|
||
|
(let ((elem (finish-current-beg-and-return-elem)))
|
||
|
(if (car elem)
|
||
|
(loop)
|
||
|
(cdr elem))))))
|
||
|
|
||
|
(finish-begs-up-to-and-including-name
|
||
|
(lambda (name)
|
||
|
;; (log-html-parsing-debug "(finish-begs-up-to-and-including-name ~S)" name)
|
||
|
(let loop-find-name ((find-begs begs)
|
||
|
(depth 1))
|
||
|
(let ((beg-name (vector-ref (car find-begs) 0)))
|
||
|
(cond ((not beg-name)
|
||
|
;; We reached the root without finding a
|
||
|
;; matching beg, so don't finish anything.
|
||
|
;;
|
||
|
;; TODO: 2022-04-02: Consider having a `*TOP*`
|
||
|
;; kludge in `parent-constraints` that's checked
|
||
|
;; here, especially for handling mid-1990s HTML
|
||
|
;; `p` element (so that we can keep `p` from
|
||
|
;; being a child of `p` even when there's no
|
||
|
;; parent `body` or `html` element).
|
||
|
(void))
|
||
|
((eqv? name beg-name)
|
||
|
;; We found a match, so finish the begs up to
|
||
|
;; depth.
|
||
|
(let loop-finish ((depth depth))
|
||
|
(or (zero? depth)
|
||
|
(begin
|
||
|
(finish-current-beg)
|
||
|
(loop-finish (- depth 1))))))
|
||
|
(else
|
||
|
;; Didn't find a match yet, and there's still at
|
||
|
;; least one more beg to look at, so recur.
|
||
|
(loop-find-name (cdr find-begs)
|
||
|
(+ depth 1))))))))
|
||
|
|
||
|
(finish-begs-upto-but-not-including-names
|
||
|
(lambda (names)
|
||
|
;; (log-html-parsing-debug "(finish-begs-upto-but-not-including-names ~S)" names)
|
||
|
;; (log-html-parsing-debug "begs = ~S" begs)
|
||
|
(let loop-find-name ((find-begs begs)
|
||
|
(depth 0))
|
||
|
(let ((beg-name (vector-ref (car find-begs) 0)))
|
||
|
(cond ((not beg-name)
|
||
|
;; We reached the root without finding a
|
||
|
;; matching beg, so simply discard it.
|
||
|
(void))
|
||
|
((memq beg-name names)
|
||
|
;; We found a match, so finish the begs up to
|
||
|
;; depth.
|
||
|
(let loop-finish ((depth depth))
|
||
|
(or (zero? depth)
|
||
|
(begin
|
||
|
(finish-current-beg)
|
||
|
(loop-finish (- depth 1))))))
|
||
|
(else
|
||
|
;; Didn't find a match yet, and there's still at
|
||
|
;; least one more beg to look at, so recur.
|
||
|
(loop-find-name (cdr find-begs)
|
||
|
(+ depth 1)))))))))
|
||
|
|
||
|
(let loop ()
|
||
|
(let ((tok (tokenizer)))
|
||
|
(if (null? tok)
|
||
|
(finish-all-begs-and-return-top)
|
||
|
(let ((kind (%html-parsing:xexp-token-kind tok)))
|
||
|
;; (log-html-parsing-debug "kind = ~S" kind)
|
||
|
(cond ((memv kind token-kinds-that-always-get-added)
|
||
|
(add-thing-as-child-of-current-beg tok))
|
||
|
((eqv? kind %html-parsing:start-token-symbol)
|
||
|
;; (log-html-parsing-debug "is-start-token-symbol")
|
||
|
(let* ((name (start-tag-name tok))
|
||
|
(cell (assq name parent-constraints)))
|
||
|
;; (log-html-parsing-debug "name = ~S cell = ~S" name cell)
|
||
|
(and cell
|
||
|
(finish-begs-upto-but-not-including-names
|
||
|
(cons 'div (cdr cell))))
|
||
|
(if (memq name empty-elements)
|
||
|
(add-thing-as-child-of-current-beg tok)
|
||
|
(set! begs (cons (vector (car tok)
|
||
|
(cdr tok))
|
||
|
begs)))))
|
||
|
((eqv? kind %html-parsing:empty-token-symbol)
|
||
|
;; Empty tag token, so just add it to current
|
||
|
;; beginning while stripping off leading `*EMPTY*'
|
||
|
;; symbol so that the token becomes normal SXML
|
||
|
;; element syntax.
|
||
|
(add-thing-as-child-of-current-beg (cdr tok)))
|
||
|
((eqv? kind %html-parsing:end-token-symbol)
|
||
|
(let ((name (end-tag-name tok)))
|
||
|
(if name
|
||
|
;; Try to finish to a start tag matching this
|
||
|
;; end tag. If none, just drop the token,
|
||
|
;; though we used to add it to the current
|
||
|
;; beginning.
|
||
|
(finish-begs-up-to-and-including-name
|
||
|
name)
|
||
|
;; We have an anonymous end tag, so match it
|
||
|
;; with the most recent beginning. If no
|
||
|
;; beginning to match, then just drop the
|
||
|
;; token, though we used to add it to the
|
||
|
;; current beginning.
|
||
|
(and (vector-ref (car begs) 0)
|
||
|
(finish-current-beg)))))
|
||
|
(else (error 'parse-html/tokenizer
|
||
|
"unknown tag kind: ~S"
|
||
|
kind)))
|
||
|
(loop))))))))))
|
||
|
|
||
|
;; TODO: Quote of message to a user:
|
||
|
;;
|
||
|
;; >I think this behavior is due to HtmlPrag's use in "parse-html/tokenizer"
|
||
|
;; >of its local "parent-constraints" variable.
|
||
|
;; >
|
||
|
;; >The following line of code from the variable binding expresses the
|
||
|
;; >constraint that any "p" element can have as immediate parent element
|
||
|
;; >only "body", "td", or "th":
|
||
|
;; >
|
||
|
;; > (p . (body td th))
|
||
|
;; >
|
||
|
;; >I think I know a good heuristic for dealing with unfamiliar but
|
||
|
;; >seemingly well-formed elements, like "page" in this case, but I'm afraid
|
||
|
;; >I don't have time to implement it right now. (I am job-hunting right
|
||
|
;; >now, and there are many other coding things I need to do first.)
|
||
|
;; >
|
||
|
;; >Would adding "page" to the above line of the HtmlPrag source code work
|
||
|
;; >around the current problem, or do you need a better solution right now?
|
||
|
|
||
|
;; @defproc %parse-html input normalized? top?
|
||
|
;;
|
||
|
;; This procedure is now used internally by @code{html->xexp} and its
|
||
|
;; variants, and should not be used directly by programs. The interface is
|
||
|
;; likely to change in future versions of HtmlPrag.
|
||
|
|
||
|
(define (%html-parsing:parse-html input normalized? top?)
|
||
|
(let ((parse
|
||
|
(lambda ()
|
||
|
(%html-parsing:parse-html/tokenizer
|
||
|
(%html-parsing:make-html-tokenizer
|
||
|
(cond ((input-port? input) input)
|
||
|
((string? input) (open-input-string input))
|
||
|
(else (error
|
||
|
'%html-parsing:parse-html
|
||
|
"invalid input type: ~E"
|
||
|
input)))
|
||
|
normalized?)
|
||
|
normalized?))))
|
||
|
(if top?
|
||
|
(cons '*TOP* (parse))
|
||
|
(parse))))
|
||
|
|
||
|
;; @defproc html->sxml-0nf input
|
||
|
;; @defprocx html->sxml-1nf input
|
||
|
;; @defprocx html->sxml-2nf input
|
||
|
;; @defprocx html->sxml input
|
||
|
(doc (defproc (html->xexp (input (or/c input-port? string?)))
|
||
|
xexp
|
||
|
|
||
|
(para "Parse HTML permissively from "
|
||
|
(racket input)
|
||
|
", which is either an input port or a string, and emit an
|
||
|
SXML/xexp equivalent or approximation. To borrow and slightly modify an
|
||
|
example from Kiselyov's discussion of his HTML parser:")
|
||
|
|
||
|
(racketinput
|
||
|
(html->xexp
|
||
|
(string-append
|
||
|
"<html><head><title></title><title>whatever</title></head>"
|
||
|
"<body> <a href=\"url\">link</a><p align=center>"
|
||
|
"<ul compact style=\"aa\"> <p>BLah<!-- comment <comment> -->"
|
||
|
" <i> italic <b> bold <tt> ened</i> still < bold </b>"
|
||
|
"</body><P> But not done yet...")))
|
||
|
(racketresultblock
|
||
|
(*TOP* (html (head (title) (title "whatever"))
|
||
|
(body "\n"
|
||
|
(a (@ (href "url")) "link")
|
||
|
(p (@ (align "center"))
|
||
|
(ul (@ (compact) (style "aa")) "\n"))
|
||
|
(p "BLah"
|
||
|
(*COMMENT* " comment <comment> ")
|
||
|
" "
|
||
|
(i " italic " (b " bold " (tt " ened")))
|
||
|
"\n"
|
||
|
"still < bold "))
|
||
|
(p " But not done yet..."))))
|
||
|
|
||
|
(para "Note that, in the emitted SXML/xexp, the text token "
|
||
|
(racket "still < bold")
|
||
|
" is "
|
||
|
(italic "not")
|
||
|
" inside the "
|
||
|
(racketfont "b")
|
||
|
" element. This is one old Web browser quirk-handling of invalid
|
||
|
HTML that this parser does not try to emulate.")))
|
||
|
;; (: html->xexp ((U String InputPort) -> Xexp))
|
||
|
(provide html->xexp)
|
||
|
(define (html->xexp input)
|
||
|
(%html-parsing:parse-html input #f #t))
|
||
|
|
||
|
;; (define-logger html-parsing)
|
||
|
|
||
|
(module+ test
|
||
|
|
||
|
(test (html->xexp "<a>>") '(*TOP* (a ">")))
|
||
|
(test (html->xexp "<a<>") '(*TOP* (a "<" ">")))
|
||
|
|
||
|
(test (html->xexp "<>") '(*TOP* "<" ">"))
|
||
|
(test (html->xexp "< >")
|
||
|
;; `(*TOP* "<" ">")
|
||
|
'(*TOP* "<" " " ">"))
|
||
|
(test (html->xexp "< a>")
|
||
|
;; `(*TOP* (a))
|
||
|
'(*TOP* "<" " a" ">"))
|
||
|
(test (html->xexp "< a / >")
|
||
|
;; `(*TOP* (a))
|
||
|
'(*TOP* "<" " a / " ">"))
|
||
|
|
||
|
(test (html->xexp "<a<") '(*TOP* (a "<")))
|
||
|
(test (html->xexp "<a<b") '(*TOP* (a (b))))
|
||
|
|
||
|
(test (html->xexp "><a>") '(*TOP* ">" (a)))
|
||
|
|
||
|
(test (html->xexp "</>") '(*TOP*))
|
||
|
|
||
|
(test (html->xexp "<\">") '(*TOP* "<" "\"" ">"))
|
||
|
|
||
|
(test (html->xexp (string-append "<a>xxx<plaintext>aaa" "\n"
|
||
|
"bbb" "\n"
|
||
|
"c<c<c"))
|
||
|
`(*TOP*
|
||
|
(a "xxx" (plaintext ,(string-append "aaa" "\n")
|
||
|
,(string-append "bbb" "\n")
|
||
|
"c<c<c"))))
|
||
|
|
||
|
(test (html->xexp "aaa<!-- xxx -->bbb")
|
||
|
`(*TOP*
|
||
|
"aaa" (*COMMENT* " xxx ") "bbb"))
|
||
|
|
||
|
(test (html->xexp "aaa<! -- xxx -->bbb")
|
||
|
`(*TOP*
|
||
|
"aaa" (*COMMENT* " xxx ") "bbb"))
|
||
|
|
||
|
(test (html->xexp "aaa<!-- xxx --->bbb")
|
||
|
`(*TOP*
|
||
|
"aaa" (*COMMENT* " xxx -") "bbb"))
|
||
|
|
||
|
(test (html->xexp "aaa<!-- xxx ---->bbb")
|
||
|
`(*TOP*
|
||
|
"aaa" (*COMMENT* " xxx --") "bbb"))
|
||
|
|
||
|
(test (html->xexp "aaa<!-- xxx -y-->bbb")
|
||
|
`(*TOP*
|
||
|
"aaa" (*COMMENT* " xxx -y") "bbb"))
|
||
|
|
||
|
(test (html->xexp "aaa<!----->bbb")
|
||
|
`(*TOP*
|
||
|
"aaa" (*COMMENT* "-") "bbb"))
|
||
|
|
||
|
(test (html->xexp "aaa<!---->bbb")
|
||
|
`(*TOP*
|
||
|
"aaa" (*COMMENT* "") "bbb"))
|
||
|
|
||
|
(test (html->xexp "aaa<!--->bbb")
|
||
|
`(*TOP* "aaa" (*COMMENT* "->bbb")))
|
||
|
|
||
|
(test (html->xexp "<hr>") `(*TOP* (hr)))
|
||
|
(test (html->xexp "<hr/>") `(*TOP* (hr)))
|
||
|
(test (html->xexp "<hr />") `(*TOP* (hr)))
|
||
|
|
||
|
(test (html->xexp "<hr noshade>")
|
||
|
`(*TOP* (hr (@ (noshade)))))
|
||
|
(test (html->xexp "<hr noshade/>")
|
||
|
`(*TOP* (hr (@ (noshade)))))
|
||
|
(test (html->xexp "<hr noshade />")
|
||
|
`(*TOP* (hr (@ (noshade)))))
|
||
|
(test (html->xexp "<hr noshade / >")
|
||
|
`(*TOP* (hr (@ (noshade)))))
|
||
|
(test (html->xexp "<hr noshade=1 />")
|
||
|
`(*TOP* (hr (@ (noshade "1")))))
|
||
|
(test (html->xexp "<hr noshade=1/>")
|
||
|
`(*TOP* (hr (@ (noshade "1/")))))
|
||
|
|
||
|
(test (html->xexp "<q>aaa<p/>bbb</q>ccc</p>ddd")
|
||
|
`(*TOP* (q "aaa" (p) "bbb") "ccc" "ddd"))
|
||
|
|
||
|
(test (html->xexp "<") `(*TOP* "<"))
|
||
|
(test (html->xexp ">") `(*TOP* ">"))
|
||
|
|
||
|
(test (html->xexp "Gilbert & Sullivan")
|
||
|
`(*TOP* "Gilbert & Sullivan"))
|
||
|
(test (html->xexp "Gilbert & Sullivan")
|
||
|
`(*TOP* "Gilbert & Sullivan"))
|
||
|
(test (html->xexp "Gilbert & Sullivan")
|
||
|
`(*TOP* "Gilbert & Sullivan"))
|
||
|
|
||
|
(test (html->xexp "Copyright © Foo")
|
||
|
`(*TOP* "Copyright "
|
||
|
(& copy)
|
||
|
" Foo"))
|
||
|
(test (html->xexp "aaa©bbb")
|
||
|
`(*TOP*
|
||
|
"aaa" (& copy) "bbb"))
|
||
|
|
||
|
(test 'unterminated-nonspecial-character-entity-reference-at-eof
|
||
|
(html->xexp "aaa©")
|
||
|
;; Note: 20180518 Previously, we parsed this as (*TOP* "aaa" (& copy))
|
||
|
`(*TOP* "aaa©"))
|
||
|
|
||
|
(test (html->xexp "*") '(*TOP* "*"))
|
||
|
(test (html->xexp "*") '(*TOP* "*"))
|
||
|
(test (html->xexp "*x") '(*TOP* "*x"))
|
||
|
(test (html->xexp "—") `(*TOP* ,(string (integer->char 151))))
|
||
|
(test (html->xexp "Ϩ") `(*TOP* ,(string (integer->char 1000))))
|
||
|
(test (html->xexp "B") '(*TOP* "B"))
|
||
|
(test (html->xexp "¢") `(*TOP* ,(string (integer->char 162))))
|
||
|
(test (html->xexp "ÿ") `(*TOP* ,(string (integer->char 255))))
|
||
|
(test (html->xexp "Ā") `(*TOP* ,(string (integer->char 256))))
|
||
|
(test (html->xexp "B") '(*TOP* "B"))
|
||
|
(test (html->xexp "&42;") '(*TOP* "&42;"))
|
||
|
|
||
|
(test (html->xexp (string-append "aaa©bbb&ccc<ddd&>"
|
||
|
"eee*fffϨgggZhhh"))
|
||
|
`(*TOP*
|
||
|
"aaa"
|
||
|
(& copy)
|
||
|
,(string-append "bbb&ccc<ddd&>eee*fff"
|
||
|
(string (integer->char 1000))
|
||
|
"gggZhhh")))
|
||
|
|
||
|
(test (html->xexp
|
||
|
(string-append
|
||
|
"<IMG src=\"http://e.e/aw/pics/listings/"
|
||
|
"ebayLogo_38x16.gif\" border=0 width=\"38\" height=\"16\" "
|
||
|
"HSPACE=5 VSPACE=0\">2</FONT>"))
|
||
|
`(*TOP*
|
||
|
(img (@
|
||
|
(src
|
||
|
"http://e.e/aw/pics/listings/ebayLogo_38x16.gif")
|
||
|
(border "0") (width "38") (height "16")
|
||
|
(hspace "5") (vspace "0")))
|
||
|
"2"))
|
||
|
|
||
|
(test (html->xexp "<aaa bbb=ccc\"ddd>eee")
|
||
|
`(*TOP* (aaa (@ (bbb "ccc") (ddd)) "eee")))
|
||
|
(test (html->xexp "<aaa bbb=ccc \"ddd>eee")
|
||
|
`(*TOP* (aaa (@ (bbb "ccc") (ddd)) "eee")))
|
||
|
|
||
|
(test (html->xexp
|
||
|
(string-append
|
||
|
"<HTML><Head><Title>My Title</Title></Head>"
|
||
|
"<Body BGColor=\"white\" Foo=42>"
|
||
|
"This is a <B><I>bold-italic</B></I> test of </Erk>"
|
||
|
"broken HTML.<br>Yes it is.</Body></HTML>"))
|
||
|
`(*TOP*
|
||
|
(html (head (title "My Title"))
|
||
|
(body (@ (bgcolor "white") (foo "42"))
|
||
|
"This is a "
|
||
|
(b (i "bold-italic"))
|
||
|
" test of "
|
||
|
"broken HTML."
|
||
|
(br)
|
||
|
"Yes it is."))))
|
||
|
|
||
|
(test (html->xexp
|
||
|
(string-append
|
||
|
"<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\""
|
||
|
" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">"))
|
||
|
`(*TOP*
|
||
|
(*DECL*
|
||
|
DOCTYPE
|
||
|
html
|
||
|
PUBLIC
|
||
|
"-//W3C//DTD XHTML 1.0 Strict//EN"
|
||
|
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd")))
|
||
|
|
||
|
(test (html->xexp
|
||
|
(string-append
|
||
|
"<html xmlns=\"http://www.w3.org/1999/xhtml\" "
|
||
|
"xml:lang=\"en\" "
|
||
|
"lang=\"en\">"))
|
||
|
`(*TOP*
|
||
|
(html (@ (xmlns "http://www.w3.org/1999/xhtml")
|
||
|
(xml:lang "en") (lang "en")))))
|
||
|
|
||
|
(test (html->xexp
|
||
|
(string-append
|
||
|
"<html:html xmlns:html=\"http://www.w3.org/TR/REC-html40\">"
|
||
|
"<html:head><html:title>Frobnostication</html:title></html:head>"
|
||
|
"<html:body><html:p>Moved to <html:a href=\"http://frob.com\">"
|
||
|
"here.</html:a></html:p></html:body></html:html>"))
|
||
|
`(*TOP*
|
||
|
(html (@ (xmlns:html "http://www.w3.org/TR/REC-html40"))
|
||
|
(head (title "Frobnostication"))
|
||
|
(body (p "Moved to "
|
||
|
(a (@ (href "http://frob.com"))
|
||
|
"here."))))))
|
||
|
|
||
|
(test (html->xexp
|
||
|
(string-append
|
||
|
"<RESERVATION xmlns:HTML=\"http://www.w3.org/TR/REC-html40\">"
|
||
|
"<NAME HTML:CLASS=\"largeSansSerif\">Layman, A</NAME>"
|
||
|
"<SEAT CLASS=\"Y\" HTML:CLASS=\"largeMonotype\">33B</SEAT>"
|
||
|
"<HTML:A HREF=\"/cgi-bin/ResStatus\">Check Status</HTML:A>"
|
||
|
"<DEPARTURE>1997-05-24T07:55:00+1</DEPARTURE></RESERVATION>"))
|
||
|
`(*TOP*
|
||
|
(reservation (@ (,(string->symbol "xmlns:HTML")
|
||
|
"http://www.w3.org/TR/REC-html40"))
|
||
|
(name (@ (class "largeSansSerif"))
|
||
|
"Layman, A")
|
||
|
(seat (@ (class "Y") (class "largeMonotype"))
|
||
|
"33B")
|
||
|
(a (@ (href "/cgi-bin/ResStatus"))
|
||
|
"Check Status")
|
||
|
(departure "1997-05-24T07:55:00+1"))))
|
||
|
|
||
|
(test (html->xexp
|
||
|
(string-append
|
||
|
"<html><head><title></title><title>whatever</title></head><body>"
|
||
|
"<a href=\"url\">link</a><p align=center><ul compact style=\"aa\">"
|
||
|
"<p>BLah<!-- comment <comment> --> <i> italic <b> bold <tt> ened </i>"
|
||
|
" still < bold </b></body><P> But not done yet..."))
|
||
|
`(*TOP*
|
||
|
(html (head (title) (title "whatever"))
|
||
|
(body (a (@ (href "url")) "link")
|
||
|
(p (@ (align "center"))
|
||
|
(ul (@ (compact) (style "aa"))))
|
||
|
(p "BLah"
|
||
|
(*COMMENT* " comment <comment> ")
|
||
|
" "
|
||
|
(i " italic " (b " bold " (tt " ened ")))
|
||
|
" still < bold "))
|
||
|
(p " But not done yet..."))))
|
||
|
|
||
|
(test (html->xexp "<?xml version=\"1.0\" encoding=\"UTF-8\"?>")
|
||
|
`(*TOP*
|
||
|
(*PI* xml "version=\"1.0\" encoding=\"UTF-8\"")))
|
||
|
|
||
|
(test (html->xexp "<?php php_info(); ?>")
|
||
|
`(*TOP* (*PI* php "php_info(); ")))
|
||
|
(test (html->xexp "<?php php_info(); ?")
|
||
|
`(*TOP* (*PI* php "php_info(); ?")))
|
||
|
(test (html->xexp "<?php php_info(); ")
|
||
|
`(*TOP* (*PI* php "php_info(); ")))
|
||
|
|
||
|
(test (html->xexp "<?foo bar ? baz > blort ?>")
|
||
|
`(*TOP*
|
||
|
(*PI* foo "bar ? baz > blort ")))
|
||
|
|
||
|
(test (html->xexp "<?foo b?>x")
|
||
|
`(*TOP* (*PI* foo "b") "x"))
|
||
|
(test (html->xexp "<?foo ?>x")
|
||
|
`(*TOP* (*PI* foo "") "x"))
|
||
|
(test (html->xexp "<?foo ?>x")
|
||
|
`(*TOP* (*PI* foo "") "x"))
|
||
|
(test (html->xexp "<?foo?>x")
|
||
|
`(*TOP* (*PI* foo "") "x"))
|
||
|
(test (html->xexp "<?f?>x")
|
||
|
`(*TOP* (*PI* f "") "x"))
|
||
|
(test (html->xexp "<??>x")
|
||
|
`(*TOP* (*PI* #f "") "x"))
|
||
|
(test (html->xexp "<?>x")
|
||
|
`(*TOP* (*PI* #f ">x")))
|
||
|
|
||
|
(test (html->xexp "<foo bar=\"baz\">blort")
|
||
|
`(*TOP* (foo (@ (bar "baz")) "blort")))
|
||
|
(test (html->xexp "<foo bar='baz'>blort")
|
||
|
`(*TOP* (foo (@ (bar "baz")) "blort")))
|
||
|
(test (html->xexp "<foo bar=\"baz'>blort")
|
||
|
`(*TOP* (foo (@ (bar "baz'>blort")))))
|
||
|
(test (html->xexp "<foo bar='baz\">blort")
|
||
|
`(*TOP* (foo (@ (bar "baz\">blort")))))
|
||
|
|
||
|
(test (html->xexp (string-append "<p>A</p>"
|
||
|
"<script>line0 <" "\n"
|
||
|
"line1" "\n"
|
||
|
"<line2></script>"
|
||
|
"<p>B</p>"))
|
||
|
`(*TOP* (p "A")
|
||
|
(script ,(string-append "line0 <" "\n")
|
||
|
,(string-append "line1" "\n")
|
||
|
"<line2>")
|
||
|
(p "B")))
|
||
|
|
||
|
(test (html->xexp "<xmp>a<b>c</XMP>d")
|
||
|
`(*TOP* (xmp "a<b>c") "d"))
|
||
|
(test (html->xexp "<XMP>a<b>c</xmp>d")
|
||
|
`(*TOP* (xmp "a<b>c") "d"))
|
||
|
(test (html->xexp "<xmp>a<b>c</foo:xmp>d")
|
||
|
`(*TOP* (xmp "a<b>c") "d"))
|
||
|
(test (html->xexp "<foo:xmp>a<b>c</xmp>d")
|
||
|
`(*TOP* (xmp "a<b>c") "d"))
|
||
|
(test (html->xexp "<foo:xmp>a<b>c</foo:xmp>d")
|
||
|
`(*TOP* (xmp "a<b>c") "d"))
|
||
|
(test (html->xexp "<foo:xmp>a<b>c</bar:xmp>d")
|
||
|
`(*TOP* (xmp "a<b>c") "d"))
|
||
|
|
||
|
(test (html->xexp "<xmp>a</b>c</xmp>d")
|
||
|
`(*TOP* (xmp "a</b>c") "d"))
|
||
|
(test (html->xexp "<xmp>a</b >c</xmp>d")
|
||
|
`(*TOP* (xmp "a</b >c") "d"))
|
||
|
(test (html->xexp "<xmp>a</ b>c</xmp>d")
|
||
|
`(*TOP* (xmp "a</ b>c") "d"))
|
||
|
(test (html->xexp "<xmp>a</ b >c</xmp>d")
|
||
|
`(*TOP* (xmp "a</ b >c") "d"))
|
||
|
(test (html->xexp "<xmp>a</b:x>c</xmp>d")
|
||
|
`(*TOP* (xmp "a</b:x>c") "d"))
|
||
|
(test (html->xexp "<xmp>a</b::x>c</xmp>d")
|
||
|
`(*TOP* (xmp "a</b::x>c") "d"))
|
||
|
(test (html->xexp "<xmp>a</b:::x>c</xmp>d")
|
||
|
`(*TOP* (xmp "a</b:::x>c") "d"))
|
||
|
(test (html->xexp "<xmp>a</b:>c</xmp>d")
|
||
|
`(*TOP* (xmp "a</b:>c") "d"))
|
||
|
(test (html->xexp "<xmp>a</b::>c</xmp>d")
|
||
|
`(*TOP* (xmp "a</b::>c") "d"))
|
||
|
(test (html->xexp "<xmp>a</xmp:b>c</xmp>d")
|
||
|
`(*TOP* (xmp "a</xmp:b>c") "d"))
|
||
|
|
||
|
(let ((expected `(*TOP* (p "real1")
|
||
|
"\n"
|
||
|
(xmp "\n"
|
||
|
,(string-append "alpha" "\n")
|
||
|
,(string-append "<P>fake</P>" "\n")
|
||
|
,(string-append "bravo" "\n"))
|
||
|
(p "real2"))))
|
||
|
|
||
|
(test (html->xexp (string-append "<P>real1</P>" "\n"
|
||
|
"<XMP>" "\n"
|
||
|
"alpha" "\n"
|
||
|
"<P>fake</P>" "\n"
|
||
|
"bravo" "\n"
|
||
|
"</XMP " "\n"
|
||
|
"<P>real2</P>"))
|
||
|
expected)
|
||
|
|
||
|
(test (html->xexp (string-append "<P>real1</P>" "\n"
|
||
|
"<XMP>" "\n"
|
||
|
"alpha" "\n"
|
||
|
"<P>fake</P>" "\n"
|
||
|
"bravo" "\n"
|
||
|
"</XMP" "\n"
|
||
|
"<P>real2</P>"))
|
||
|
expected))
|
||
|
|
||
|
(test (html->xexp "<xmp>a</xmp>x")
|
||
|
`(*TOP* (xmp "a") "x"))
|
||
|
(test (html->xexp (string-append "<xmp>a" "\n" "</xmp>x"))
|
||
|
`(*TOP* (xmp ,(string-append "a" "\n")) "x"))
|
||
|
(test (html->xexp "<xmp></xmp>x")
|
||
|
`(*TOP* (xmp) "x"))
|
||
|
|
||
|
(test (html->xexp "<xmp>a</xmp") `(*TOP* (xmp "a")))
|
||
|
(test (html->xexp "<xmp>a</xm") `(*TOP* (xmp "a</xm")))
|
||
|
(test (html->xexp "<xmp>a</x") `(*TOP* (xmp "a</x")))
|
||
|
(test (html->xexp "<xmp>a</") `(*TOP* (xmp "a</")))
|
||
|
(test (html->xexp "<xmp>a<") `(*TOP* (xmp "a<")))
|
||
|
(test (html->xexp "<xmp>a") `(*TOP* (xmp "a")))
|
||
|
(test (html->xexp "<xmp>") `(*TOP* (xmp)))
|
||
|
(test (html->xexp "<xmp") `(*TOP* (xmp)))
|
||
|
|
||
|
(test (html->xexp "<xmp x=42 ")
|
||
|
`(*TOP* (xmp (@ (x "42")))))
|
||
|
(test (html->xexp "<xmp x= ") `(*TOP* (xmp (@ (x)))))
|
||
|
(test (html->xexp "<xmp x ") `(*TOP* (xmp (@ (x)))))
|
||
|
(test (html->xexp "<xmp x") `(*TOP* (xmp (@ (x)))))
|
||
|
|
||
|
(test (html->xexp "<script>xxx")
|
||
|
`(*TOP* (script "xxx")))
|
||
|
(test (html->xexp "<script/>xxx")
|
||
|
`(*TOP* (script) "xxx"))
|
||
|
|
||
|
(test (html->xexp "<html xml:lang=\"en\" lang=\"en\">")
|
||
|
`(*TOP* (html (@ (xml:lang "en") (lang "en")))))
|
||
|
|
||
|
(test (html->xexp "<a href=/foo.html>")
|
||
|
`(*TOP* (a (@ (href "/foo.html")))))
|
||
|
(test (html->xexp "<a href=/>foo.html")
|
||
|
`(*TOP* (a (@ (href "/")) "foo.html")))
|
||
|
|
||
|
;; TODO: Add verbatim-pair cases with attributes in the end tag.
|
||
|
|
||
|
(test (html->xexp "©")
|
||
|
`(*TOP* (& copy)))
|
||
|
(test (html->xexp "⇒")
|
||
|
`(*TOP* (& ,(string->symbol "rArr"))))
|
||
|
(test (html->xexp "—")
|
||
|
`(*TOP* ,(string (integer->char 151))))
|
||
|
|
||
|
(test (html->xexp "ϧ")
|
||
|
`(*TOP* ,(string (integer->char 999))))
|
||
|
|
||
|
(test (html->xexp "xxx<![CDATA[abc]]>yyy")
|
||
|
`(*TOP* "xxx" "abc" "yyy"))
|
||
|
|
||
|
(test (html->xexp "xxx<![CDATA[ab]c]]>yyy")
|
||
|
`(*TOP* "xxx" "ab]c" "yyy"))
|
||
|
|
||
|
(test (html->xexp "xxx<![CDATA[ab]]c]]>yyy")
|
||
|
`(*TOP* "xxx" "ab]]c" "yyy"))
|
||
|
|
||
|
(test (html->xexp "xxx<![CDATA[]]]>yyy")
|
||
|
`(*TOP* "xxx" "]" "yyy"))
|
||
|
|
||
|
(test (html->xexp "xxx<![CDATAyyy")
|
||
|
`(*TOP* "xxx" "<![CDATA" "yyy"))
|
||
|
|
||
|
(test (html->xexp "<html><div><p>P1</p><p>P2</p></div><p>P3</p>")
|
||
|
`(*TOP* (html (div (p "P1")
|
||
|
(p "P2"))
|
||
|
(p "P3")))
|
||
|
#:id 'parent-constraints-with-div)
|
||
|
|
||
|
(test (html->xexp "—")
|
||
|
`(*TOP* ,(string (integer->char 151)))
|
||
|
#:id 'once-again-converting-character-references-above-126-to-string)
|
||
|
|
||
|
(test (html->xexp "<ul><li>a<p>b</p>")
|
||
|
`(*TOP* (ul (li "a" (p "b"))))
|
||
|
#:id 'p-element-can-be-child-of-li-element)
|
||
|
|
||
|
(test-section 'unterminated-named-character-entity-references
|
||
|
|
||
|
(test-section 'old-school-names
|
||
|
(test 'amp (html->xexp "<p>a&z</p>") '(*TOP* (p "a&z")))
|
||
|
(test 'apos (html->xexp "<p>a&aposz</p>") '(*TOP* (p "a'z")))
|
||
|
(test 'lt (html->xexp "<p>a<z</p>") '(*TOP* (p "a<z")))
|
||
|
(test 'gt (html->xexp "<p>a>z</p>") '(*TOP* (p "a>z")))
|
||
|
(test 'quot (html->xexp "<p>a"z</p>") '(*TOP* (p "a\"z"))))
|
||
|
|
||
|
(test-section 'new-school-names
|
||
|
(test 'rarr (html->xexp "<p>a&rarrz</p>") '(*TOP* (p "a&rarrz"))))
|
||
|
|
||
|
(test-section 'unrecognized-names
|
||
|
(test 'a-t-and-t
|
||
|
(html->xexp "<p>AT&T Bell Labs</p>")
|
||
|
'(*TOP* (p "AT&T Bell Labs")))))
|
||
|
|
||
|
(test-section 'terminated-old-school-named-character-entity-references
|
||
|
(test 'amp (html->xexp "<p>a&z</p>") '(*TOP* (p "a&z")))
|
||
|
(test 'apos (html->xexp "<p>a'z</p>") '(*TOP* (p "a'z")))
|
||
|
(test 'lt (html->xexp "<p>a<z</p>") '(*TOP* (p "a<z")))
|
||
|
(test 'gt (html->xexp "<p>a>z</p>") '(*TOP* (p "a>z")))
|
||
|
(test 'quot (html->xexp "<p>a"z</p>") '(*TOP* (p "a\"z"))))
|
||
|
|
||
|
(test-section 'parent-constraints-should-permit-p-inside-blockquote
|
||
|
(test 'basic
|
||
|
(html->xexp "<x>A<blockquote><p>B</p>C<p>D</p>E</blockquote>F</x>")
|
||
|
`(*TOP* (x "A"
|
||
|
(blockquote (p "B")
|
||
|
"C"
|
||
|
(p "D")
|
||
|
"E")
|
||
|
"F")))
|
||
|
(test 'initial-sorawee-porncharoenwase-example
|
||
|
(html->xexp
|
||
|
"<blockquote><tr><td><blockquote><p></p></blockquote></td><div></div></tr></blockquote>")
|
||
|
'(*TOP* (blockquote (tr (td (blockquote (p)))
|
||
|
(div))))))
|
||
|
|
||
|
(test-section 'p-elem-can-be-child-of-details-elem
|
||
|
|
||
|
(test 'initial-jacder-example-modified
|
||
|
(html->xexp "<div><details><p>text in details</p></details></div>")
|
||
|
'(*TOP* (div (details (p "text in details"))))))
|
||
|
|
||
|
;; TODO: 2022-01-22 Go through latest HTML elements and add them to parent constraints
|
||
|
;; in the ancient quirks handling from over 20 years ago.
|
||
|
|
||
|
(test-section 'p-elem-cannot-be-child-of-p-elem
|
||
|
|
||
|
(test 'initial-simpson-example
|
||
|
(html->xexp "<html><p>foo<p>bar</html>")
|
||
|
'(*TOP* (html (p "foo") (p "bar"))))
|
||
|
|
||
|
(test 'non-p-parent-in-which-p-shouldnt-nest
|
||
|
(html->xexp "<html><xyz>foo<p>bar</html>")
|
||
|
'(*TOP* (html (xyz "foo") (p "bar"))))
|
||
|
|
||
|
(test 'p-shouldnt-nest-even-without-html-element
|
||
|
(html->xexp "<p>foo<p>bar")
|
||
|
'(*TOP* (p "foo") (p "bar"))
|
||
|
#:fail "see code comment about parent-constraints when no top elements"))
|
||
|
|
||
|
(test-section 'break-the-world-with-h-elem-mid-90s-handling
|
||
|
|
||
|
(test 'initial
|
||
|
(html->xexp "<html><p>foo<h3>bar</h3></html>")
|
||
|
'(*TOP* (html (p "foo") (h3 "bar")))))
|
||
|
|
||
|
(test-section 'twrpme-h2-in-li-elements
|
||
|
|
||
|
(test 'simple
|
||
|
(html->xexp "<html><body><ul><li><h2>My Header Item</h2></li><li>My Non-Header Item</li></ul></body></html>")
|
||
|
'(*TOP* (html (body (ul (li (h2 "My Header Item"))
|
||
|
(li "My Non-Header Item"))))))
|
||
|
|
||
|
(test 'simon-budinsky-example
|
||
|
(html->xexp
|
||
|
(string-append
|
||
|
"<html>"
|
||
|
"<head></head>"
|
||
|
"<body>"
|
||
|
"<ul class=\"post-list\">"
|
||
|
"<li>"
|
||
|
"<span class=\"post-meta\">Mar 10, 2022</span>"
|
||
|
"<h2>"
|
||
|
"<a class=\"post-link\" href=\"/site/update/2022/03/10/twrp-3.6.1-released.html\">TWRP 3.6.1 Released</a>"
|
||
|
"</h2>"
|
||
|
"</li>"
|
||
|
"<li>"
|
||
|
"<span class=\"post-meta\">Mar 10, 2022</span>"
|
||
|
"<h2>"
|
||
|
"<a class=\"post-link\" href=\"/site/update/2022/03/10/twrp-3.6.1-released.html\">TWRP 3.6.1 Released</a>"
|
||
|
"</h2>"
|
||
|
"</li>"
|
||
|
"</ul>"
|
||
|
"</body>"
|
||
|
"</html>"))
|
||
|
'(*TOP* (html (head)
|
||
|
(body (ul (@ (class "post-list"))
|
||
|
(li (span (@ (class "post-meta"))
|
||
|
"Mar 10, 2022")
|
||
|
(h2 (a (@ (class "post-link")
|
||
|
(href "/site/update/2022/03/10/twrp-3.6.1-released.html"))
|
||
|
"TWRP 3.6.1 Released")))
|
||
|
|
||
|
(li (span (@ (class "post-meta"))
|
||
|
"Mar 10, 2022")
|
||
|
(h2 (a (@ (class "post-link")
|
||
|
(href "/site/update/2022/03/10/twrp-3.6.1-released.html"))
|
||
|
"TWRP 3.6.1 Released")))))))))
|
||
|
|
||
|
(test-section 'area-in-span-in-microformats
|
||
|
|
||
|
(test 'area-from-jacob-hall
|
||
|
(html->xexp "<div><span><area></area></span></div>")
|
||
|
'(*TOP* (div (span (area)))))
|
||
|
|
||
|
(test 'object-from-jacob-hall
|
||
|
(html->xexp "<div><object>Jane Doe</object></div>")
|
||
|
'(*TOP* (div (object "Jane Doe")))))
|
||
|
|
||
|
(test-section 'bugs-from-cadence
|
||
|
|
||
|
(test 'figcaption-as-parent-of-p
|
||
|
(html->xexp "<figcaption> <p class=\"caption\">words</p><img src=\"nowhere\"></figcaption>")
|
||
|
'(*TOP* (figcaption " "
|
||
|
(p (@ (class "caption")) "words")
|
||
|
(img (@ (src "nowhere"))))))
|
||
|
|
||
|
(test 'attribute-html-entities-1
|
||
|
(html->xexp "<img src=\"https://example.com/images?src=Blah.jpg&width=150\">")
|
||
|
'(*TOP* (img (@ (src "https://example.com/images?src=Blah.jpg&width=150")))))
|
||
|
|
||
|
(test 'attribute-html-entities-2
|
||
|
(html->xexp "<a href=\"https://example.com/phpbb/viewtopic.php?f=31&t=1125\">https://example.com/phpbb/viewtopic.php?f=31&t=1125</a>")
|
||
|
'(*TOP* (a (@ (href "https://example.com/phpbb/viewtopic.php?f=31&t=1125"))
|
||
|
"https://example.com/phpbb/viewtopic.php?f=31&t=1125"))))
|
||
|
|
||
|
;; TODO: Document this.
|
||
|
;;
|
||
|
;; (define html-1 "<myelem myattr=\"&\">")
|
||
|
;; (define shtml (html->xexp html-1))
|
||
|
;; shtml
|
||
|
;; (define html-2 (shtml->html shtml))
|
||
|
;; html-2
|
||
|
)
|
||
|
|
||
|
(doc history
|
||
|
|
||
|
(#:planet 12:0 #:date "2023-01-28"
|
||
|
(itemlist
|
||
|
(item "Within attributes, HTML entities like & are now decoded.")
|
||
|
(item "A "
|
||
|
(code "p")
|
||
|
" element can now appear as a child of a "
|
||
|
(code "figcaption")
|
||
|
" element.")))
|
||
|
|
||
|
(#:planet 11:0 #:date "2022-07-19"
|
||
|
(itemlist
|
||
|
(item "An "
|
||
|
(code "object")
|
||
|
" element is no longer considered always-empty. Incrementing major
|
||
|
version again, because this could break parses.")))
|
||
|
|
||
|
(#:planet 10:0 #:date "2022-07-19"
|
||
|
(itemlist
|
||
|
(item "To pass a \"microformats\" test suite ("
|
||
|
(hyperlink "https://github.com/microformats/tests/blob/master/tests/microformats-v2/h-card/impliedname.html#L11"
|
||
|
"impliedname.html")
|
||
|
"), an "
|
||
|
(code "area")
|
||
|
" element can now be a child of a "
|
||
|
(code "span")
|
||
|
" element. In the future, we might be even more flexible about where "
|
||
|
(code "span")
|
||
|
" elements are permitted. (Thanks to Jacob Hall for
|
||
|
discussing.)")))
|
||
|
|
||
|
(#:planet 9:0 #:date "2022-04-16"
|
||
|
(itemlist
|
||
|
(item "Header elements may once again appear as children of "
|
||
|
(code "li")
|
||
|
" elements (which we broke in the previous version), as we see how far we can stretch a 20 year-old hack for invalid HTML. (Thanks for Simon Budinsky for reporting.)")))
|
||
|
|
||
|
(#:planet 8:0 #:date "2022-04-03"
|
||
|
(itemlist
|
||
|
(item "The original \"H\" elements ("
|
||
|
(code "h1")
|
||
|
", "
|
||
|
(code "h2")
|
||
|
", etc.) now are parsed with \"parent constraints\" for
|
||
|
handling invalid HTML, to accommodate a need to parse mid-1990s HTML in which "
|
||
|
(code "<P>")
|
||
|
" was used as a separator or terminator, rather than a
|
||
|
start delimeter. There is a chance that change this will break a real-world
|
||
|
scraper or other tool.")))
|
||
|
|
||
|
(#:planet 7:1 #:date "2022-04-02"
|
||
|
(itemlist
|
||
|
(item "Include a test case #:fail that was unsaved in DrRacket.")))
|
||
|
|
||
|
(#:planet 7:0 #:date "2022-04-02"
|
||
|
(itemlist
|
||
|
|
||
|
(item "Fixed a quirks-handling bug in which "
|
||
|
(code "p")
|
||
|
" elements would be (directly or indirectly) nested under other "
|
||
|
(code "p")
|
||
|
" elements in cases in which there was no "
|
||
|
(code "body")
|
||
|
" element, but there was an "
|
||
|
(code "html")
|
||
|
" element. (Thanks to Jonathan Simpson for reporting.)")))
|
||
|
|
||
|
(#:planet 6:1 #:date "2022-01-22"
|
||
|
(itemlist
|
||
|
(item "Permit "
|
||
|
(code "details")
|
||
|
" element to be parent of "
|
||
|
(code "p")
|
||
|
" element in quirks handling. (Thanks to Jacder for reporting.)")))
|
||
|
|
||
|
(#:planet 6:0 #:date "2018-05-22"
|
||
|
(itemlist
|
||
|
(item "Fix to permit "
|
||
|
(code "p")
|
||
|
" elements as children of "
|
||
|
(code "blockquote")
|
||
|
" elements. Incrementing major version number because
|
||
|
this is a breaking change of 17 years, but seems an appropriate change for
|
||
|
modern HTML, and fixes a particular real-world problem. (Thanks to
|
||
|
Sorawee Porncharoenwase for reporting.)")))
|
||
|
|
||
|
(#:planet 5:0 #:date "2018-05-15"
|
||
|
(itemlist
|
||
|
(item "In a breaking change of handing invalid HTML, most named
|
||
|
character entity references that are invalid because (possibly among multiple
|
||
|
reasons) they are not terminated by semicolon, now are treated as literal
|
||
|
strings (including the ampersand), rather than as named character entites. For
|
||
|
example, parser input string "
|
||
|
(racket "<p>A&B Co.<p>")
|
||
|
" will now parse as "
|
||
|
(racket (p "A&B Co."))
|
||
|
" rather than as "
|
||
|
(racket (p "A" (& B) " Co."))
|
||
|
". (Thanks for Greg Hendershott for suggesting this, and discussing.)")
|
||
|
(item "For support of historical quirks handling, five early
|
||
|
HTML named character entities (specifically, "
|
||
|
(tt "amp")
|
||
|
", "
|
||
|
(tt "apos")
|
||
|
", "
|
||
|
(tt "lt")
|
||
|
", "
|
||
|
(tt "gt")
|
||
|
", "
|
||
|
(tt "quot")
|
||
|
") do not need to be terminated with a semicolon, and
|
||
|
will even be recognized if followed immediately by an alphabetic. For
|
||
|
example, "
|
||
|
(racket "<p>a<z</p>")
|
||
|
" will now parse as "
|
||
|
(racket (p "a<z"))
|
||
|
", rather than as "
|
||
|
(racket (p (& ltz)))
|
||
|
".")
|
||
|
(item "Invalid character entity references that are terminated
|
||
|
by EOF rather than semicolon may now be parsed as literal strings, rather than
|
||
|
as entity references.")))
|
||
|
|
||
|
(#:planet 4:3 #:date "2016-12-15"
|
||
|
(itemlist
|
||
|
(item "Error message ``"
|
||
|
(code "%html-parsing:parse-html: invalid input type:")
|
||
|
"'' now abbreviates the invalid value, to avoid possibly
|
||
|
huge messages. (Thanks to John B. Clements.)")))
|
||
|
|
||
|
(#:planet 4:2 #:date "2016-03-02"
|
||
|
(itemlist
|
||
|
(item "Tweaked info.rkt, filenames.")))
|
||
|
|
||
|
(#:planet 4:1 #:date "2016-02-25"
|
||
|
(itemlist
|
||
|
(item "Updated deps.")
|
||
|
(item "Documentation tweaks.")))
|
||
|
|
||
|
(#:planet 4:0 #:date "2016-02-21"
|
||
|
(itemlist
|
||
|
(item "Moving from PLaneT to new package system.")
|
||
|
(item "Moved unit tests into main source file.")))
|
||
|
|
||
|
(#:planet 3:0 #:date "2015-04-24"
|
||
|
(itemlist
|
||
|
(item "Numeric character entities now parse to Racket strings
|
||
|
instead of Racket characters, to bring SXML/xexp back closer to SXML. (Thanks
|
||
|
to John Clements for reporting.)")))
|
||
|
|
||
|
(#:planet 2:0 #:date "2012-06-13"
|
||
|
(itemlist
|
||
|
(item "Converted to McFly.")))
|
||
|
|
||
|
(#:version "0.3" #:planet 1:2 #:date "2011-08-27"
|
||
|
(itemlist
|
||
|
(item "Converted test suite from Testeez to Overeasy.")))
|
||
|
|
||
|
(#:version "0.2" #:planet 1:1 #:date "2011-08-27"
|
||
|
(itemlist
|
||
|
(item "Fixed embarrassing bug due to code tidying. (Thanks to
|
||
|
Danny Yoo for reporting.)")))
|
||
|
|
||
|
(#:version "0.1" #:planet 1:0 #:date "2011-08-21"
|
||
|
(itemlist
|
||
|
(item "Part of forked development from HtmlPrag, parser
|
||
|
originally written 2001-04."))))
|