#!/usr/bin/env newlisp
;; a port of John Gruber's Markdown.pl (http://daringfireball.net/markdown) script to newLISP...
;; see his original Perl script for explanations of the fearsome regexen and
;; byzantine logic, etc...
;; TODO:
;; the following Markdown tests give different results:
;; nested brackets (see Links, reference style.text)
;; backticks in html tags (see test file Code Spans.text)
;; parens in url : .jpg) see (Images.text)
;; a nested ordered list error (see Ordered and unordered lists.text)
;; some odd backslash escapes in Backslash escapes.text
;; Add: email address scrambling
;;
;; version date 2008-08-08 16:54:56
;; changed (unless to (if (not ...
;;
;; version date 2008-03-07 15:36:09
;; fixed load error
;;
;; version date 2007-11-17 16:20:57
;; added syntax colouring module
;;
;; version date 2007-11-14 09:19:42
;; removed reliance on dostring for compatibility with 9.1
;; author: cormullion
(if (not (context? Pretty))
(load
{tokenizer.lsp}
{prettyprint-css.lsp}))
(context 'markdown)
(define (markdown:markdown txt)
(initialize)
(unescape-special-chars (block-transforms (strip-link-definitions (protect (cleanup txt))))))
; initialize
(define (initialize)
(set '*escape-chars* [text]\`*_{}[]()>#+-.![/text])
(set '*escape-pairs* '(
([text]\\\\[/text] [text]\\[/text])
([text]\\`[/text] [text]`[/text])
([text]\\\*[/text] [text]*[/text])
([text]\\_[/text] [text]_[/text])
([text]\\\{[/text] [text]{[/text])
([text]\\\}[/text] [text]}[/text])
([text]\\\[[/text] [text][[/text])
([text]\\\][/text] [text]][/text])
([text]\\\([/text] [text]([/text])
([text]\\\)[/text] [text])[/text])
([text]\\>[/text] [text]>[/text])
([text]\\\#[/text] [text]#[/text])
([text]\\\+[/text] [text]+[/text])
([text]\\\-[/text] [text]-[/text])
([text]\\\.[/text] [text].[/text])
([text]\\![/text] [text]![/text])
))
(set '*hashed-html-blocks* '())
(build-escape-table)
(set '*list-level* 0))
(define (block-transforms txt)
(form-paragraphs
(protect
(block-quotes
(code-blocks
(lists
(horizontal-rules
(headers txt))))))))
(define (span-transforms txt)
(line-breaks
(emphasis
(amps-and-angles
(auto-links
(anchors
(images
(escape-special-chars
(escape-special-chars-within-tag-attributes
(code-spans txt))))))))))
(define (hash s)
(base64-enc (uuid)))
(define (build-escape-table)
(set '*escape-table* '())
(dolist (c (explode *escape-chars*))
(push (list c (hash c)) *escape-table*)))
(define (tokenize-html xhtml)
; return list of tag/text portions of xhtml text
(letn (
(tag-match [text]((?s:)|
(?s:<\?.*?\?>)|
(?:<[a-z/!$](?:[^<>]|
(?:<[a-z/!$](?:[^<>]|
(?:<[a-z/!$](?:[^<>]|
(?:<[a-z/!$](?:[^<>]|
(?:<[a-z/!$](?:[^<>]|
(?:<[a-z/!$](?:[^<>])*>))*>))*>))*>))*>))*>))[/text]) ; yeah, well...
(str xhtml)
(len (length str))
(pos 0)
(tokens '())
)
(while (set 'tag-start (find tag-match str 8))
(if (< pos tag-start)
(push (list text (slice str pos (- tag-start pos))) tokens -1))
(push (list tag $0) tokens -1)
(set 'str (slice str (+ tag-start (length $0))))
(set 'pos 0))
; leftovers
(if (< pos len)
(push (list (quote text) (slice str pos (- len pos))) tokens -1))
tokens)
)
(define (escape-special-chars-within-tag-attributes txt)
(let ((temp (tokenize-html txt))
(new-text {})
)
(dolist (pair temp)
(if (= (first pair) 'tag)
; 'tag
(begin
(set 'new-text (replace {\\} (last pair) (lookup {\\} *escape-table*) 0))
(replace [text](?<=.)?code>(?=.)[/text] new-text (lookup {`} *escape-table*) 0)
(replace {\*} new-text (lookup {*} *escape-table*) 0)
(replace {_} new-text (lookup {_} *escape-table*) 0)
)
; 'text
(set 'new-text (last pair))
)
(nth-set (temp $idx) (list (first pair) new-text)))
(join (map last temp)) ; return as text
)
)
(define (escape-special-chars txt )
; replaces characters in tags and text tokens
(let ((temp (tokenize-html txt))
(new-text {})
)
(dolist (pair temp)
(if (= (first pair) 'tag)
; 'tag
(begin
(set 'new-text (replace {\\} (last pair) (lookup {\\} *escape-table*) 0))
(replace [text](?<=.)?code>(?=.)[/text] new-text (lookup {`} *escape-table*) 0)
(replace {\*} new-text (lookup {*} *escape-table*) 0)
(replace {_} new-text (lookup {_} *escape-table*) 0)
)
; 'text
(begin
(set 'new-text (encode-backslash-escapes (last pair)))))
(nth-set (temp $idx) (list (first pair) new-text)))
(join (map last temp)) ; return as text
)
)
(define (encode-backslash-escapes t)
(dolist (pair *escape-pairs*)
(replace (first pair) t (lookup (last pair) *escape-table*) 14))
t)
(define (encode-code s)
; encode/escape certain characters inside Markdown code runs
(replace {&} s "&" 0)
(replace {<} s "<" 0)
(replace {>} s ">" 0)
(replace {\*} s (lookup {*} *escape-table*) 0)
(replace {_} s (lookup {_} *escape-table*) 0)
(replace "{" s (lookup "{" *escape-table*) 0)
(replace {\[} s (lookup {[} *escape-table*) 0)
(replace {\]} s (lookup {]} *escape-table*) 0)
(replace {\\} s (lookup {\\} *escape-table*) 0)
s)
(define (code-spans s)
(replace
{(?} (encode-code (trim $2)) {})
2
)
s)
(define (encode-alt s)
(replace {&} s "&" 0)
(replace {"} s """ 0)
s)
(define (images txt)
(let ((alt-text {})
(url {} )
(title {})
(ref-regex {(!\[(.*?)\][ ]?(?:\n[ ]*)?\[(.*?)\])})
(inline-regex {(!\[(.*?)\]\([ \t]*(\S+?)>?[ \t]*((['"])(.*?)\5[ \t]*)?\))})
(whole-match {})
(result {})
(id-ref {})
(url {})
)
; reference links ![alt text][id]
(replace
ref-regex
txt
(begin
(set 'whole-match $1 'alt-text $2 'id-ref $3)
(if alt-text
(replace {"} alt-text {"} 0))
(if (empty? id-ref)
(set 'id-ref (lower-case alt-text)))
(if (lookup id-ref *link-database*)
(set 'url (first (lookup id-ref *link-database*)))
(set 'url nil))
(if url
(begin
(replace {\*} url (lookup {*} *escape-table*) 0)
(replace {_} url (lookup {_} *escape-table*) 0)
))
(if (last (lookup id-ref *link-database*))
; title
(begin
(set 'title (last (lookup id-ref *link-database*)))
(replace {"} title {"} 0)
(replace {\*} title (lookup {*} *escape-table*) 0)
(replace {_} title (lookup {_} *escape-table*) 0))
; no title
(set 'title {})
)
(if url
(set 'result (string
{}))
(set 'result whole-match))
)
0
)
; inline image refs: 
(replace
inline-regex
txt
(begin
(set 'whole-match $1)
(set 'alt-text $2)
(set 'url $3)
(set 'title $6)
(if alt-text
(replace {"} alt-text {"} 0)
(set 'alt-text {}))
(if title
(begin
(replace {"} title {"} 0)
(replace {\*} title (lookup {*} *escape-table*) 0)
(replace {_} title (lookup {_} *escape-table*) 0))
(set 'title {}))
(replace {\*} url (lookup {*} *escape-table*) 0)
(replace {_} url (lookup {_} *escape-table*) 0)
(string
{
})
)
0
)
; empty ones are possible
(set '$1 {})
(replace {!\[(.*?)\]\([ \t]*\)}
txt
(string {
})
0
)
)
txt)
(define (make-anchor link-text id-ref )
; Link defs are in the form: ^[id]: url "optional title"
; stored in link db list as (id (url title))
; params are text to be linked and the id of the link in the db
; eg bar 1 for [bar][1]
(let (
(title {})
(id id-ref)
(url nil)
)
(if link-text
(begin
(replace {"} link-text {"} 0)
(replace {\n} link-text { } 0)
(replace {[ ]?\n} link-text { } 0)
))
(if (null? id ) (set 'id (lower-case link-text)))
(if (not (nil? (lookup id *link-database*)))
(begin
(set 'url (first (lookup id *link-database*)))
(replace {\*} url (lookup {*} *escape-table*) 0)
(replace {_} url (lookup {_} *escape-table*) 0)
(if (set 'title (last (lookup id *link-database*)))
(begin
(replace {"} title {"} 0)
(replace {\*} title (lookup {*} *escape-table*) 0)
(replace {_} title (lookup {_} *escape-table*) 0))
(set 'title {})))
(set 'url nil))
(if url
(string {} link-text {})
(string {[} link-text {][} id-ref {]}))
)
)
(define (anchors txt)
(letn ((nested-brackets {(?>[^\[\]]+)*})
(ref-link-regex (string {(\[(} nested-brackets {)\][ ]?(?:\n[ ]*)?\[(.*?)\])}))
(inline-regex {(\[(.*?)\]\([ ]*(.*?\)?)>?[ ]*((['"])(.*?)\5[ \t]*)?\))})
(link-text {})
(url {})
(title {})
)
; reference-style links: [link text] [id]
(set '$1 {} '$2 {} '$3 {} '$4 {} '$5 {} '$6 {}) ; i still don't think I should have to do this...
(replace ref-link-regex txt (make-anchor $2 $3) 8) ; $2 is link text, $3 is id
; inline links: [link text](url "optional title")
(set '$1 {} '$2 {} '$3 {} '$4 {} '$5 {} '$6 {})
(replace
inline-regex
txt
(begin
(set 'link-text $2)
(set 'url $3)
(set 'title $6)
(if link-text (replace {"} link-text {"} 0))
(if title
(begin
(replace {"} title {"} 0)
(replace {\*} title (lookup {*} *escape-table*) 0)
(replace {_} title (lookup {_} *escape-table*) 0))
(set 'title {}))
(replace {\*} url (lookup {*} *escape-table*) 0)
(replace {_} url (lookup {_} *escape-table*) 0)
(replace {^<(.*)>$} url $1 0)
(string
{} link-text {}
))
8
) ; replace
)
txt)
(define (auto-links txt)
(replace
[text]<((https?|ftp):[^'">\s]+)>[/text]
txt
(string {} $1 {})
0
)
; to-do: email ...
txt)
(define (amps-and-angles txt)
; Smart processing for ampersands and angle brackets
(replace
[text]&(?!\#?[xX]?(?:[0-9a-fA-F]+|\w+);)[/text]
txt
{&}
10
)
(replace
[text]<(?![a-z/?\$!])[/text]
txt
{<}
10
)
txt)
(define (emphasis txt)
; italics/bold: strong first
(replace
[text] (\*\*|__) (?=\S) (.+?[*_]*) (?<=\S) \1 [/text]
txt
(string {} $2 {})
8
)
(replace
[text] (\*|_) (?=\S) (.+?) (?<=\S) \1 [/text]
txt
(string {} $2 {})
8
)
txt)
(define (line-breaks txt)
; handles line break markers
(replace " {2,}\n" txt "
\n" 0)
txt)
(define (cleanup txt)
; cleanup the text by normalizing some possible variations
(replace "\r\n|\r" txt "\n" 0) ; standardize line ends
(push "\n\n" txt -1) ; end with two returns
(set 'txt (detab txt)) ; convert tabs to spaces
(replace "\n[ \t]+\n" txt "\n\n" 0) ; lines with only spaces and tabs
txt)
(define (protect txt)
; protect or "hash html blocks"
(let ((nested-block-regex [text](^<(p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del)\b(.*\n)*?\2>[ \t]*(?=\n+|\Z))[/text])
(liberal-tag-regex [text](^<(p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math)\b(.*\n)*?.*\2>[ \t]*(?=\n+|\Z))[/text])
(hr-regex [text](?:(?<=\n\n)|\A\n?)([ ]{0,3}<(hr)\b([^<>])*?/?>[ \t]*(?=\n{2,}|\Z))[/text])
(html-comment-regex [text](?:(?<=\n\n)|\A\n?)([ ]{0,3}(?s:)[ \t]*(?=\n{2,}|\Z))[/text])
)
(dolist (rgx (list nested-block-regex liberal-tag-regex hr-regex html-comment-regex))
(replace
rgx
txt
(begin
(set 'key (hash $1))
(push (list key $1 ) *hashed-html-blocks* -1)
(string "\n\n" key "\n\n"))
2)
)
)
txt)
(define (unescape-special-chars t)
; Swap back in all the special characters we've hidden.
(dolist (pair *escape-table*)
(replace (last pair) t (first pair) 10)
)
t)
(define (strip-link-definitions txt)
; strip link definitions from the text and store them
; Link defs are in the form: ^[id]: url "optional title"
; stored in link db list as (id (url title))
(let ((link-db '())
(url {})
(id {})
(title {})
)
(replace
[text]^[ ]{0,3}\[(.+)\]:[ \t]*\n?[ \t]*(\S+?)>?[ \t]*\n?[ \t]*(?:(?<=\s)["(](.+?)[")][ \t]*)?(?:\n+|\Z)[/text]
txt
(begin
(set 'id (lower-case $1) 'url (amps-and-angles $2) 'title $3)
(if title (replace {"} title {"} 0))
(push (list id (list url title)) link-db)
(set '$3 {}) ; necessary?
(string {}) ; remove from text
)
10)
(set '*link-database* link-db)
txt
)
)
(define (horizontal-rules txt)
(replace
[text]^[ ]{0,2}([ ]?\*[ ]?){3,}[ \t]*$[/text]
txt
"\n
" (Pretty:pretty-output token-list) "\n"))))
; don't format
(begin
; trim leading and trailing newlines
(set 'code-block (trim (detab (encode-code (outdent code-block))) "\n"))
(set '$1 {})
(set 'code-block (string "\n\n" code-block "\n\n\n")))))
10
)
)
txt)
(define (block-quotes txt)
(let ((block-quote {})
)
(replace
[text]((^[ \t]*>[ \t]?.+\n(.+\n)*\n*)+)[/text]
txt
(begin
(set 'block-quote $1)
(replace {^[ ]*>[ ]?} block-quote {} 2)
(replace {^[ ]+$} block-quote {} 2)
(set 'block-quote (block-transforms block-quote)) ; recurse
; remove leading spaces
(replace
{(\s*.+?)} block-quote (trim $1) 2) (string "
\n" block-quote "\n\n\n")) 2 ) ) txt) (define (outdent s) (replace [text]^(\t|[ ]{1,4})[/text] s {} 2) s) (define (detab s) (replace [text](.*?)\t[/text] s (string $1 (dup { } (- 4 (% (length $1) 4)))) 2) ) (define (form-paragraphs txt) (let ((grafs '()) (original nil) ) (set 'txt (trim txt "\n")) ; strip blank lines before and after (set 'grafs (parse txt "\n{2,}" 0)) ; split (dolist (p grafs) (if (set 'original (lookup p *hashed-html-blocks*)) ; html blocks (nth-set (grafs $idx) original) ; wrap
tags round everything else (nth-set (grafs $idx) (string {
} (replace {^[ ]*} (span-transforms p) {} (+ 4 8 16)) {
})) )) (join grafs "\n\n") ) ) ; the end