#!/usr/bin/env newlisp

;; @module tokenizer.lsp
;; @description experimental tokenizer for newLISP source
;; @version 0.0.3 of 2008-08-08 17:10:36 - unless removed
;; @version 0.0.2 of 2008-01-09 19:27:26
;   - a stupid list indexing overrun bug now cruelly exposed by newLISP version 9.3... :)
;; @version 0.0.1 of 2007-11-30 13:55:03
;   - first version
;; @author cormullion

; usage
; (context MAIN)
; (load {tokenizer.lsp})
; (println (Tokenize (read-file {source-file.lsp})))

(context 'Tokenize)

(define (Tokenize:Tokenize txt)
  ;; Tokenize the newLISP source held in the string txt.
  ;;
  ;; Returns a list of (type text) pairs in source order. type is one of
  ;;   "code" "left-paren" "right-paren" "quoted-string" "braced-string"
  ;;   "bracketed-string" "comment" "white-space"
  ;; A "white-space" pair carries an empty string; it is produced when two
  ;; consecutive boundaries fall on the same position (e.g. at the newline
  ;; that terminates a comment).
  ;; Returns the empty list when txt is nil or the empty string.
  ;;
  ;; Based on newLISP guru Fanda's code to scan source and split according
  ;; to category:
  ;; Mode: 0 - code
  ;;       1 - ""
  ;;       2 - {}
  ;;       3 - [text][/text]
  ;;       4 - ; or #
  (let (i 0
        ch ""
        mode 0
        txt-length 0
        code-starts true
        n-brackets 0
        result '()          ; list of (mode start-position) boundaries
        token-list '()
        raw-tokens '()
        source-string ""
        tok-mode 0          ; locals for the conversion pass, so that
        tok-start 0         ; nothing leaks into the Tokenize context
        tok-end 0)
    ; nil or empty input: nothing to scan, fall through and return '()
    (if (and txt (!= txt ""))
      (begin
        ;; ---- pass 1: find the boundaries between categories ----
        ; classify the beginning of the text
        (set 'ch (txt 0))
        (if (= ch {"})                  (set 'mode 1)
            ; the opening brace has already been consumed here, so the
            ; nesting depth must start at 1 or the closing brace is missed
            (= ch "{")                  (begin (set 'mode 2) (set 'n-brackets 1))
            (= "[text]" (0 6 txt))      (set 'mode 3)
            (or (= ch ";") (= ch "#"))  (set 'mode 4)
            ; the code span is opened by the push just below; clearing the
            ; flag stops the loop from opening a second one at position 1
            (begin (set 'mode 0) (set 'code-starts nil)))
        (push (list mode 0) result -1)
        ; skip the char(s) just classified
        (if (= mode 3) (inc 'i 6) (inc 'i))

        ; main processing loop
        (set 'txt-length (length txt))
        (while (< i txt-length)
          (set 'ch (txt i))
          (case mode
            ; code: look for the start of a string or comment
            (0 (if (= ch {"})
                     (begin (set 'mode 1)
                            (push (list mode i) result -1))
                   (= ch "{")
                     (begin (inc 'n-brackets)
                            (set 'mode 2)
                            (push (list mode i) result -1))
                   (= "[text]" (i 6 txt))
                     (begin (set 'mode 3)
                            (push (list mode i) result -1)
                            (inc 'i 5))    ; rest of the tag; loop adds 1 more
                   (or (= ch ";") (= ch "#"))
                     (begin (set 'mode 4)
                            (push (list mode i) result -1))
                   ; first code character after a string/comment ended
                   (if code-starts
                       (begin (set 'code-starts nil)
                              (push (list mode i) result -1)))))
            ; "" string: a backslash skips the following character
            (1 (if (= ch {\})
                     (inc 'i)
                   (= ch {"})
                     ; the end is not marked; the next boundary closes the span
                     (begin (set 'mode 0)
                            (set 'code-starts true))))
            ; {} string: braces nest
            (2 (if (= ch "{")
                     (inc 'n-brackets)
                   (= ch "}")
                     (begin (dec 'n-brackets)
                            (if (= 0 n-brackets)
                                (begin (set 'mode 0)
                                       (set 'code-starts true))))))
            ; [text] string: runs until the closing tag
            (3 (if (= "[/text]" (i 7 txt))
                   (begin (inc 'i 6)      ; rest of the tag; loop adds 1 more
                          (set 'mode 0)
                          (set 'code-starts true))))
            ; comment: runs until end of line
            (4 (if (= ch "\n")
                   (begin
                     (push (list mode i) result -1)
                     ; step back so the "\n" is rescanned as code
                     (dec 'i)
                     (set 'mode 0)
                     (set 'code-starts true)))))
          (inc 'i))

        ; end the last span according to the current mode
        (push (list mode (length txt)) result -1)

        ;; ---- pass 2: convert each [start, next-start) span to tokens ----
        (for (p 0 (- (length result) 2))   ; fixed 0.0.2
          (set 'tok-mode (first (result p)))
          (set 'tok-start (last (result p)))
          (set 'tok-end (last (result (+ p 1))))
          (set 'source-string (slice txt tok-start (- tok-end tok-start)))
          (cond
            ((= tok-start tok-end)
              (push (list "white-space" source-string) token-list -1))
            ((= tok-mode 0)
              ; replace : with colon, otherwise it will be
              ; stripped out by parse
              (replace ":" source-string " colon ")
              ; bad code will not parse!
              (set 'raw-tokens (parse source-string))
              (dolist (tkn raw-tokens)
                (cond
                  ((= tkn "(")
                    (push (list "left-paren" tkn) token-list -1))
                  ((= tkn ")")
                    (push (list "right-paren" tkn) token-list -1))
                  ((= tkn "colon")
                    (push (list "code" ":") token-list -1))
                  (true
                    (push (list "code" tkn) token-list -1)))))
            ((= tok-mode 1)
              (push (list "quoted-string" source-string) token-list -1))
            ((= tok-mode 2)
              (push (list "braced-string" source-string) token-list -1))
            ((= tok-mode 3)
              (push (list "bracketed-string" source-string) token-list -1))
            ((= tok-mode 4)
              (push (list "comment" source-string) token-list -1))))))
    token-list))

(context MAIN)

; EOF

; quick test
;(println (Tokenize (read-file (main-args 1))))