#!/usr/bin/env newlisp ;; a port of John Gruber's Markdown.pl (http://daringfireball.net/markdown) script to newLISP... ;; see his original Perl script for explanations of the fearsome regexen and ;; byzantine logic, etc... ;; TODO: ;; the following Markdown tests give different results: ;; nested brackets (see Links, reference style.text) ;; backticks in html tags (see test file Code Spans.text) ;; parens in url : ![this is a stupid URL](http://example.com/(parens).jpg) see (Images.text) ;; a nested ordered list error (see Ordered and unordered lists.text) ;; some odd backslash escapes in Backslash escapes.text ;; Add: email address scrambling ;; ;; version date 2008-08-08 16:54:56 ;; changed (unless to (if (not ... ;; ;; version date 2008-03-07 15:36:09 ;; fixed load error ;; ;; version date 2007-11-17 16:20:57 ;; added syntax colouring module ;; ;; version date 2007-11-14 09:19:42 ;; removed reliance on dostring for compatibility with 9.1 ;; author: cormullion (if (not (context? Pretty)) (load {tokenizer.lsp} {prettyprint-css.lsp})) (context 'markdown) (define (markdown:markdown txt) (initialize) (unescape-special-chars (block-transforms (strip-link-definitions (protect (cleanup txt)))))) ; initialize (define (initialize) (set '*escape-chars* [text]\`*_{}[]()>#+-.![/text]) (set '*escape-pairs* '( ([text]\\\\[/text] [text]\\[/text]) ([text]\\`[/text] [text]`[/text]) ([text]\\\*[/text] [text]*[/text]) ([text]\\_[/text] [text]_[/text]) ([text]\\\{[/text] [text]{[/text]) ([text]\\\}[/text] [text]}[/text]) ([text]\\\[[/text] [text][[/text]) ([text]\\\][/text] [text]][/text]) ([text]\\\([/text] [text]([/text]) ([text]\\\)[/text] [text])[/text]) ([text]\\>[/text] [text]>[/text]) ([text]\\\#[/text] [text]#[/text]) ([text]\\\+[/text] [text]+[/text]) ([text]\\\-[/text] [text]-[/text]) ([text]\\\.[/text] [text].[/text]) ([text]\\![/text] [text]![/text]) )) (set '*hashed-html-blocks* '()) (build-escape-table) (set '*list-level* 0)) (define (block-transforms txt) (form-paragraphs (protect (block-quotes (code-blocks (lists (horizontal-rules (headers txt)))))))) (define (span-transforms txt) (line-breaks (emphasis (amps-and-angles (auto-links (anchors (images (escape-special-chars (escape-special-chars-within-tag-attributes (code-spans txt)))))))))) (define (hash s) (base64-enc (uuid))) (define (build-escape-table) (set '*escape-table* '()) (dolist (c (explode *escape-chars*)) (push (list c (hash c)) *escape-table*))) (define (tokenize-html xhtml) ; return list of tag/text portions of xhtml text (letn ( (tag-match [text]((?s:)| (?s:<\?.*?\?>)| (?:<[a-z/!$](?:[^<>]| (?:<[a-z/!$](?:[^<>]| (?:<[a-z/!$](?:[^<>]| (?:<[a-z/!$](?:[^<>]| (?:<[a-z/!$](?:[^<>]| (?:<[a-z/!$](?:[^<>])*>))*>))*>))*>))*>))*>))[/text]) ; yeah, well... (str xhtml) (len (length str)) (pos 0) (tokens '()) ) (while (set 'tag-start (find tag-match str 8)) (if (< pos tag-start) (push (list text (slice str pos (- tag-start pos))) tokens -1)) (push (list tag $0) tokens -1) (set 'str (slice str (+ tag-start (length $0)))) (set 'pos 0)) ; leftovers (if (< pos len) (push (list (quote text) (slice str pos (- len pos))) tokens -1)) tokens) ) (define (escape-special-chars-within-tag-attributes txt) (let ((temp (tokenize-html txt)) (new-text {}) ) (dolist (pair temp) (if (= (first pair) 'tag) ; 'tag (begin (set 'new-text (replace {\\} (last pair) (lookup {\\} *escape-table*) 0)) (replace [text](?<=.)(?=.)[/text] new-text (lookup {`} *escape-table*) 0) (replace {\*} new-text (lookup {*} *escape-table*) 0) (replace {_} new-text (lookup {_} *escape-table*) 0) ) ; 'text (set 'new-text (last pair)) ) (nth-set (temp $idx) (list (first pair) new-text))) (join (map last temp)) ; return as text ) ) (define (escape-special-chars txt ) ; replaces characters in tags and text tokens (let ((temp (tokenize-html txt)) (new-text {}) ) (dolist (pair temp) (if (= (first pair) 'tag) ; 'tag (begin (set 'new-text (replace {\\} (last pair) (lookup {\\} *escape-table*) 0)) (replace [text](?<=.)(?=.)[/text] new-text (lookup {`} *escape-table*) 0) (replace {\*} new-text (lookup {*} *escape-table*) 0) (replace {_} new-text (lookup {_} *escape-table*) 0) ) ; 'text (begin (set 'new-text (encode-backslash-escapes (last pair))))) (nth-set (temp $idx) (list (first pair) new-text))) (join (map last temp)) ; return as text ) ) (define (encode-backslash-escapes t) (dolist (pair *escape-pairs*) (replace (first pair) t (lookup (last pair) *escape-table*) 14)) t) (define (encode-code s) ; encode/escape certain characters inside Markdown code runs (replace {&} s "&" 0) (replace {<} s "<" 0) (replace {>} s ">" 0) (replace {\*} s (lookup {*} *escape-table*) 0) (replace {_} s (lookup {_} *escape-table*) 0) (replace "{" s (lookup "{" *escape-table*) 0) (replace {\[} s (lookup {[} *escape-table*) 0) (replace {\]} s (lookup {]} *escape-table*) 0) (replace {\\} s (lookup {\\} *escape-table*) 0) s) (define (code-spans s) (replace {(?} (encode-code (trim $2)) {}) 2 ) s) (define (encode-alt s) (replace {&} s "&" 0) (replace {"} s """ 0) s) (define (images txt) (let ((alt-text {}) (url {} ) (title {}) (ref-regex {(!\[(.*?)\][ ]?(?:\n[ ]*)?\[(.*?)\])}) (inline-regex {(!\[(.*?)\]\([ \t]*?[ \t]*((['"])(.*?)\5[ \t]*)?\))}) (whole-match {}) (result {}) (id-ref {}) (url {}) ) ; reference links ![alt text][id] (replace ref-regex txt (begin (set 'whole-match $1 'alt-text $2 'id-ref $3) (if alt-text (replace {"} alt-text {"} 0)) (if (empty? id-ref) (set 'id-ref (lower-case alt-text))) (if (lookup id-ref *link-database*) (set 'url (first (lookup id-ref *link-database*))) (set 'url nil)) (if url (begin (replace {\*} url (lookup {*} *escape-table*) 0) (replace {_} url (lookup {_} *escape-table*) 0) )) (if (last (lookup id-ref *link-database*)) ; title (begin (set 'title (last (lookup id-ref *link-database*))) (replace {"} title {"} 0) (replace {\*} title (lookup {*} *escape-table*) 0) (replace {_} title (lookup {_} *escape-table*) 0)) ; no title (set 'title {}) ) (if url (set 'result (string {} 
          alt-text {})) (set 'result whole-match)) ) 0 ) ; inline image refs: ![alt text](url "optional title") (replace inline-regex txt (begin (set 'whole-match $1) (set 'alt-text $2) (set 'url $3) (set 'title $6) (if alt-text (replace {"} alt-text {"} 0) (set 'alt-text {})) (if title (begin (replace {"} title {"} 0) (replace {\*} title (lookup {*} *escape-table*) 0) (replace {_} title (lookup {_} *escape-table*) 0)) (set 'title {})) (replace {\*} url (lookup {*} *escape-table*) 0) (replace {_} url (lookup {_} *escape-table*) 0) (string {} 
           alt-text {}) ) 0 ) ; empty ones are possible (set '$1 {}) (replace {!\[(.*?)\]\([ \t]*\)} txt (string {} $1 {}) 0 ) ) txt) (define (make-anchor link-text id-ref ) ; Link defs are in the form: ^[id]: url "optional title" ; stored in link db list as (id (url title)) ; params are text to be linked and the id of the link in the db ; eg bar 1 for [bar][1] (let ( (title {}) (id id-ref) (url nil) ) (if link-text (begin (replace {"} link-text {"} 0) (replace {\n} link-text { } 0) (replace {[ ]?\n} link-text { } 0) )) (if (null? id ) (set 'id (lower-case link-text))) (if (not (nil? (lookup id *link-database*))) (begin (set 'url (first (lookup id *link-database*))) (replace {\*} url (lookup {*} *escape-table*) 0) (replace {_} url (lookup {_} *escape-table*) 0) (if (set 'title (last (lookup id *link-database*))) (begin (replace {"} title {"} 0) (replace {\*} title (lookup {*} *escape-table*) 0) (replace {_} title (lookup {_} *escape-table*) 0)) (set 'title {}))) (set 'url nil)) (if url (string {} link-text {}) (string {[} link-text {][} id-ref {]})) ) ) (define (anchors txt) (letn ((nested-brackets {(?>[^\[\]]+)*}) (ref-link-regex (string {(\[(} nested-brackets {)\][ ]?(?:\n[ ]*)?\[(.*?)\])})) (inline-regex {(\[(.*?)\]\([ ]*?[ ]*((['"])(.*?)\5[ \t]*)?\))}) (link-text {}) (url {}) (title {}) ) ; reference-style links: [link text] [id] (set '$1 {} '$2 {} '$3 {} '$4 {} '$5 {} '$6 {}) ; i still don't think I should have to do this... (replace ref-link-regex txt (make-anchor $2 $3) 8) ; $2 is link text, $3 is id ; inline links: [link text](url "optional title") (set '$1 {} '$2 {} '$3 {} '$4 {} '$5 {} '$6 {}) (replace inline-regex txt (begin (set 'link-text $2) (set 'url $3) (set 'title $6) (if link-text (replace {"} link-text {"} 0)) (if title (begin (replace {"} title {"} 0) (replace {\*} title (lookup {*} *escape-table*) 0) (replace {_} title (lookup {_} *escape-table*) 0)) (set 'title {})) (replace {\*} url (lookup {*} *escape-table*) 0) (replace {_} url (lookup {_} *escape-table*) 0) (replace {^<(.*)>$} url $1 0) (string {} link-text {} )) 8 ) ; replace ) txt) (define (auto-links txt) (replace [text]<((https?|ftp):[^'">\s]+)>[/text] txt (string {} $1 {}) 0 ) ; to-do: email ... txt) (define (amps-and-angles txt) ; Smart processing for ampersands and angle brackets (replace [text]&(?!\#?[xX]?(?:[0-9a-fA-F]+|\w+);)[/text] txt {&} 10 ) (replace [text]<(?![a-z/?\$!])[/text] txt {<} 10 ) txt) (define (emphasis txt) ; italics/bold: strong first (replace [text] (\*\*|__) (?=\S) (.+?[*_]*) (?<=\S) \1 [/text] txt (string {} $2 {}) 8 ) (replace [text] (\*|_) (?=\S) (.+?) (?<=\S) \1 [/text] txt (string {} $2 {}) 8 ) txt) (define (line-breaks txt) ; handles line break markers (replace " {2,}\n" txt "
\n" 0) txt) (define (cleanup txt) ; cleanup the text by normalizing some possible variations (replace "\r\n|\r" txt "\n" 0) ; standardize line ends (push "\n\n" txt -1) ; end with two returns (set 'txt (detab txt)) ; convert tabs to spaces (replace "\n[ \t]+\n" txt "\n\n" 0) ; lines with only spaces and tabs txt) (define (protect txt) ; protect or "hash html blocks" (let ((nested-block-regex [text](^<(p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del)\b(.*\n)*?[ \t]*(?=\n+|\Z))[/text]) (liberal-tag-regex [text](^<(p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math)\b(.*\n)*?.*[ \t]*(?=\n+|\Z))[/text]) (hr-regex [text](?:(?<=\n\n)|\A\n?)([ ]{0,3}<(hr)\b([^<>])*?/?>[ \t]*(?=\n{2,}|\Z))[/text]) (html-comment-regex [text](?:(?<=\n\n)|\A\n?)([ ]{0,3}(?s:)[ \t]*(?=\n{2,}|\Z))[/text]) ) (dolist (rgx (list nested-block-regex liberal-tag-regex hr-regex html-comment-regex)) (replace rgx txt (begin (set 'key (hash $1)) (push (list key $1 ) *hashed-html-blocks* -1) (string "\n\n" key "\n\n")) 2) ) ) txt) (define (unescape-special-chars t) ; Swap back in all the special characters we've hidden. (dolist (pair *escape-table*) (replace (last pair) t (first pair) 10) ) t) (define (strip-link-definitions txt) ; strip link definitions from the text and store them ; Link defs are in the form: ^[id]: url "optional title" ; stored in link db list as (id (url title)) (let ((link-db '()) (url {}) (id {}) (title {}) ) (replace [text]^[ ]{0,3}\[(.+)\]:[ \t]*\n?[ \t]*?[ \t]*\n?[ \t]*(?:(?<=\s)["(](.+?)[")][ \t]*)?(?:\n+|\Z)[/text] txt (begin (set 'id (lower-case $1) 'url (amps-and-angles $2) 'title $3) (if title (replace {"} title {"} 0)) (push (list id (list url title)) link-db) (set '$3 {}) ; necessary? (string {}) ; remove from text ) 10) (set '*link-database* link-db) txt ) ) (define (horizontal-rules txt) (replace [text]^[ ]{0,2}([ ]?\*[ ]?){3,}[ \t]*$[/text] txt "\n
" 14) (replace [text]^[ ]{0,2}([ ]? -[ ]?){3,}[ \t]*$[/text] txt "\n
" 14) (replace [text]^[ ]{0,2}([ ]? _[ ]?){3,}[ \t]*$[/text] txt "\n
" 14) txt) (define (headers txt) ; setext headers (let ((level 1) ) (replace [text]^(.+)[ \t]*\n=+[ \t]*\n+[/text] txt (string "

" (span-transforms $1) "

\n\n") 2) (replace [text]^(.+)[ \t]*\n-+[ \t]*\n+[/text] txt (string "

" (span-transforms $1) "

\n\n") 2) ; atx headers (replace [text]^(\#{1,6})\s*(.+?)[ ]*\#*(\n+)[/text] txt (begin (set 'level (length $1)) (string "" (span-transforms $2) "\n\n") ) 2) ) txt) (define (lists txt) (letn ((marker-ul {[*+-]}) (marker-ol {\d+[.]}) (marker-any (string {(?:} marker-ul {|} marker-ol {)})) (whole-list-regex (string [text](([ ]{0,3}([/text] marker-any [text])[ \t]+)(?s:.+?)(\z|\n{2,}(?=\S)(?![ \t]*[/text] marker-any [text][ \t]+)))[/text])) (my-list {}) (list-type {}) (my-result {}) ) (replace (if (> *list-level* 0) (string {^} whole-list-regex) (string {(?:(?<=\n\n)|\A\n?)} whole-list-regex)) txt (begin (set 'my-list $1) (if (find $3 marker-ul) (set 'list-type "ul" 'marker-type marker-ul) (set 'list-type "ol" 'marker-type marker-ol)) (replace [text]\n{2,}[/text] my-list "\n\n\n" 0) (set 'my-result (process-list-items my-list marker-any)) (replace {\s+$} my-result {} 0) (string {<} list-type {>} "\n" my-result "\n" {} "\n")) 10 ; must be multiline ) ) txt ) (define (process-list-items list-text marker-any) (let ((list-regex (string [text](\n)?(^[ \t]*)([/text] marker-any [text])[ \t]+((?s:.+?)(\n{1,2}))(?=\n*(\z|\2([/text] marker-any [text])[ \t]+))[/text])) (item {}) (leading-line {}) (leading-space {}) (result {}) ) (inc '*list-level*) (replace [text]\n{2,}\z[/text] list-text "\n" 0) (set '$1 {} '$2 {} '$3 {} '$4 {} '$5 {}) (replace list-regex list-text (begin (set 'item $4) (set 'leading-line $1) (set 'leading-space $2) (if (or (not (empty? leading-line)) (ends-with item "\n{2,}" 0)) (set 'item (block-transforms (outdent item))) ; recurse for sub lists (begin (set 'item (lists (outdent item))) (set 'item (span-transforms (trim item "\n"))) )) (string {
  • } item {
  • } "\n") ) 10) (dec '*list-level*) list-text ) ) (define (code-blocks txt) (let ((code-block {}) (token-list '()) ) (replace [text](?:\n\n|\A)((?:(?:[ ]{4}|\t).*\n+)+)((?=^[ ]{0,3}\S)|\Z)[/text] txt (begin (set 'code-block $1) ; prettify if it's newlisp and the module is loaded (if (and (starts-with code-block " ;newlisp\n") (context? Tokenize)) ; format newlisp (begin ; remove flag (replace "[ ]{4};newlisp\n" code-block {} 0) (set 'token-list (Tokenize (trim (detab (outdent code-block)) "\n"))) (set 'code-block (protect (string "
    " (Pretty:pretty-output token-list) "\n
    ")))) ; don't format (begin ; trim leading and trailing newlines (set 'code-block (trim (detab (encode-code (outdent code-block))) "\n")) (set '$1 {}) (set 'code-block (string "\n\n
    " code-block "\n
    \n\n"))))) 10 ) ) txt) (define (block-quotes txt) (let ((block-quote {}) ) (replace [text]((^[ \t]*>[ \t]?.+\n(.+\n)*\n*)+)[/text] txt (begin (set 'block-quote $1) (replace {^[ ]*>[ ]?} block-quote {} 2) (replace {^[ ]+$} block-quote {} 2) (set 'block-quote (block-transforms block-quote)) ; recurse ; remove leading spaces (replace {(\s*
    .+?
    )} block-quote (trim $1) 2) (string "
    \n" block-quote "\n
    \n\n")) 2 ) ) txt) (define (outdent s) (replace [text]^(\t|[ ]{1,4})[/text] s {} 2) s) (define (detab s) (replace [text](.*?)\t[/text] s (string $1 (dup { } (- 4 (% (length $1) 4)))) 2) ) (define (form-paragraphs txt) (let ((grafs '()) (original nil) ) (set 'txt (trim txt "\n")) ; strip blank lines before and after (set 'grafs (parse txt "\n{2,}" 0)) ; split (dolist (p grafs) (if (set 'original (lookup p *hashed-html-blocks*)) ; html blocks (nth-set (grafs $idx) original) ; wrap

    tags round everything else (nth-set (grafs $idx) (string {

    } (replace {^[ ]*} (span-transforms p) {} (+ 4 8 16)) {

    })) )) (join grafs "\n\n") ) ) ; the end