[functional-tests] more work on the regex engine
This commit is contained in:
parent
ca16f0eac8
commit
59a77ee44a
@ -1,7 +1,14 @@
|
|||||||
(library
|
(library
|
||||||
(regex)
|
(regex)
|
||||||
(export compile-rx)
|
(export lit
|
||||||
|
cat
|
||||||
|
alt
|
||||||
|
opt
|
||||||
|
star
|
||||||
|
plus
|
||||||
|
compile-rx)
|
||||||
(import (chezscheme)
|
(import (chezscheme)
|
||||||
|
(loops)
|
||||||
(matchable))
|
(matchable))
|
||||||
|
|
||||||
;; Simple regex library, because it's friday and I'm bored.
|
;; Simple regex library, because it's friday and I'm bored.
|
||||||
@ -16,7 +23,7 @@
|
|||||||
;; (star rx)
|
;; (star rx)
|
||||||
;; (plus rx)
|
;; (plus rx)
|
||||||
;;
|
;;
|
||||||
;; The expressions get compile-rxd into a vector of vm instructions.
|
;; The expressions get compiled into a vector of vm instructions.
|
||||||
;; (char c)
|
;; (char c)
|
||||||
;; (match)
|
;; (match)
|
||||||
;; (jmp x)
|
;; (jmp x)
|
||||||
@ -25,67 +32,115 @@
|
|||||||
;; instructions are closures that manipulate the thread
|
;; instructions are closures that manipulate the thread
|
||||||
|
|
||||||
;; FIXME: slow
|
;; FIXME: slow
|
||||||
(define (append-instr code . i)
|
(define (append-instr code . i) (append code i))
|
||||||
(append code i))
|
(define (label-instr l) `(label ,l))
|
||||||
|
(define (jmp-instr l) `(jmp ,l))
|
||||||
(define (label-instr l)
|
(define (char-instr c) `(char ,c))
|
||||||
`(label ,l))
|
(define (split-instr l1 l2) `(split ,l1 ,l2))
|
||||||
|
(define (match-instr) '(match))
|
||||||
(define (jmp-instr l)
|
(define (match-instr? instr) (equal? '(match) instr))
|
||||||
`(jmp ,l))
|
|
||||||
|
|
||||||
(define (char-instr c)
|
|
||||||
`(char ,c))
|
|
||||||
|
|
||||||
(define (split-instr l1 l2)
|
|
||||||
`(split ,l1 ,l2))
|
|
||||||
|
|
||||||
(define (match-instr)
|
|
||||||
'(match))
|
|
||||||
|
|
||||||
(define (label-code label code)
|
(define (label-code label code)
|
||||||
(cons (label-instr label) code))
|
(cons (label-instr label) code))
|
||||||
|
|
||||||
;; Compiles to a list of labelled instructions that can later be flattened
|
;; Compiles to a list of labelled instructions that can later be flattened
|
||||||
;; into a linear sequence.
|
;; into a linear sequence.
|
||||||
(define (compile-rx exp)
|
(define (lit str)
|
||||||
(match exp
|
(map char-instr (string->list str)))
|
||||||
(('lit str)
|
|
||||||
(map char-instr (string->list str)))
|
|
||||||
|
|
||||||
(('cat rx1 rx2)
|
(define (cat rx1 rx2)
|
||||||
(append (compile-rx rx1) (compile-rx rx2)))
|
(append rx1 rx2))
|
||||||
|
|
||||||
(('alt rx1 rx2)
|
(define (alt rx1 rx2)
|
||||||
(let ((label1 (gensym))
|
(let ((label1 (gensym))
|
||||||
(label2 (gensym)))
|
(label2 (gensym))
|
||||||
(let ((c1 (label-code label1 (compile-rx rx1)))
|
(tail (gensym)))
|
||||||
(c2 (label-code label2 (compile-rx rx2))))
|
(let ((c1 (label-code label1
|
||||||
(cons (split-instr label1 label2)
|
(append-instr rx1 (jmp-instr tail))))
|
||||||
(append c1 c2)))))
|
(c2 (label-code label2 rx2)))
|
||||||
|
(cons (split-instr label1 label2)
|
||||||
|
(append-instr (append c1 c2) (label-instr tail))))))
|
||||||
|
|
||||||
(('opt rx)
|
(define (opt rx)
|
||||||
(let ((head (gensym))
|
(let ((head (gensym))
|
||||||
(tail (gensym)))
|
(tail (gensym)))
|
||||||
(cons (split-instr head tail)
|
(cons (split-instr head tail)
|
||||||
(label-code head (append-instr (compile-rx rx)
|
(label-code head
|
||||||
(label-instr tail))))))
|
(append-instr rx (label-instr tail))))))
|
||||||
|
|
||||||
(('star rx)
|
(define (star rx)
|
||||||
(let ((head (gensym))
|
(let ((head (gensym))
|
||||||
(tail (gensym)))
|
(tail (gensym)))
|
||||||
(cons (split-instr head tail)
|
(cons (split-instr head tail)
|
||||||
(label-code head (append-instr (compile-rx rx)
|
(label-code head
|
||||||
(jmp-instr head)
|
(append-instr rx
|
||||||
(label-instr tail))))))
|
(jmp-instr head)
|
||||||
|
(label-instr tail))))))
|
||||||
(('plus rx)
|
|
||||||
(let ((head (gensym))
|
(define (plus rx)
|
||||||
(tail (gensym)))
|
(let ((head (gensym))
|
||||||
(label-code head
|
(tail (gensym)))
|
||||||
(append-instr (compile-rx rx)
|
(label-code head
|
||||||
(split-instr head tail)
|
(append-instr rx
|
||||||
(label-instr tail)))))))
|
(split-instr head tail)
|
||||||
|
(label-instr tail)))))
|
||||||
|
|
||||||
|
(define (label-locations code)
|
||||||
|
(let ((locs (make-eq-hashtable)))
|
||||||
|
(let loop ((pc 0)
|
||||||
|
(code code))
|
||||||
|
(if (null? code)
|
||||||
|
locs
|
||||||
|
(match (car code)
|
||||||
|
(('label l)
|
||||||
|
(begin
|
||||||
|
(hashtable-set! locs l pc)
|
||||||
|
(loop pc (cdr code))))
|
||||||
|
(instr
|
||||||
|
(loop (+ 1 pc) (cdr code))))))))
|
||||||
|
|
||||||
|
(define (remove-labels code locs)
|
||||||
|
(let loop ((pc 0)
|
||||||
|
(code code)
|
||||||
|
(acc '()))
|
||||||
|
(if (null? code)
|
||||||
|
(reverse acc)
|
||||||
|
(match (car code)
|
||||||
|
(('label l)
|
||||||
|
(loop pc (cdr code) acc))
|
||||||
|
|
||||||
|
(('jmp l)
|
||||||
|
(loop (+ 1 pc) (cdr code)
|
||||||
|
(cons `(jmp ,(hashtable-ref locs l #f)) acc)))
|
||||||
|
|
||||||
|
(('split l1 l2)
|
||||||
|
(loop (+ 1 pc) (cdr code)
|
||||||
|
(cons `(split ,(hashtable-ref locs l1 #f)
|
||||||
|
,(hashtable-ref locs l2 #f))
|
||||||
|
acc)))
|
||||||
|
|
||||||
|
(instr (loop (+ 1 pc) (cdr code) (cons instr acc)))))))
|
||||||
|
|
||||||
|
(define (optimise-jumps! code)
|
||||||
|
(upto (n (vector-length code))
|
||||||
|
(match (vector-ref code n)
|
||||||
|
(('jmp l)
|
||||||
|
(when (match-instr? (vector-ref code l))
|
||||||
|
(vector-set! code n (match-instr))))
|
||||||
|
|
||||||
|
(('split l1 l2)
|
||||||
|
(when (or (match-instr? (vector-ref code l1))
|
||||||
|
(match-instr? (vector-ref code l2)))
|
||||||
|
(vector-set! code n (match-instr))))
|
||||||
|
|
||||||
|
(_ _)))
|
||||||
|
code)
|
||||||
|
|
||||||
|
(define (compile-rx rx)
|
||||||
|
(let ((rx (append-instr rx (match-instr))))
|
||||||
|
(optimise-jumps!
|
||||||
|
(list->vector
|
||||||
|
(remove-labels rx (label-locations rx))))))
|
||||||
|
|
||||||
;; A 'thread' consists of an index into the instructions. A 'bundle' holds
|
;; A 'thread' consists of an index into the instructions. A 'bundle' holds
|
||||||
;; the current threads. Note there cannot be more threads than instructions,
|
;; the current threads. Note there cannot be more threads than instructions,
|
||||||
|
Loading…
Reference in New Issue
Block a user