[functional-tests] more work on the regex engine

This commit is contained in:
Joe Thornber 2017-08-18 13:21:17 +01:00
parent ca16f0eac8
commit 59a77ee44a

View File

@ -1,7 +1,14 @@
(library (library
(regex) (regex)
(export compile-rx) (export lit
cat
alt
opt
star
plus
compile-rx)
(import (chezscheme) (import (chezscheme)
(loops)
(matchable)) (matchable))
;; Simple regex library, because it's friday and I'm bored. ;; Simple regex library, because it's friday and I'm bored.
@ -16,7 +23,7 @@
;; (star rx) ;; (star rx)
;; (plus rx) ;; (plus rx)
;; ;;
;; The expressions get compile-rxd into a vector of vm instructions. ;; The expressions get compiled into a vector of vm instructions.
;; (char c) ;; (char c)
;; (match) ;; (match)
;; (jmp x) ;; (jmp x)
@ -25,67 +32,115 @@
;; instructions are closures that manipulate the thread ;; instructions are closures that manipulate the thread
;; FIXME: slow ;; FIXME: slow
(define (append-instr code . i) (define (append-instr code . i) (append code i))
(append code i)) (define (label-instr l) `(label ,l))
(define (jmp-instr l) `(jmp ,l))
(define (label-instr l) (define (char-instr c) `(char ,c))
`(label ,l)) (define (split-instr l1 l2) `(split ,l1 ,l2))
(define (match-instr) '(match))
(define (jmp-instr l) (define (match-instr? instr) (equal? '(match) instr))
`(jmp ,l))
(define (char-instr c)
`(char ,c))
(define (split-instr l1 l2)
`(split ,l1 ,l2))
(define (match-instr)
'(match))
(define (label-code label code) (define (label-code label code)
(cons (label-instr label) code)) (cons (label-instr label) code))
;; Compiles to a list of labelled instructions that can later be flattened ;; Compiles to a list of labelled instructions that can later be flattened
;; into a linear sequence. ;; into a linear sequence.
(define (compile-rx exp) (define (lit str)
(match exp (map char-instr (string->list str)))
(('lit str)
(map char-instr (string->list str)))
(('cat rx1 rx2) (define (cat rx1 rx2)
(append (compile-rx rx1) (compile-rx rx2))) (append rx1 rx2))
(('alt rx1 rx2) (define (alt rx1 rx2)
(let ((label1 (gensym)) (let ((label1 (gensym))
(label2 (gensym))) (label2 (gensym))
(let ((c1 (label-code label1 (compile-rx rx1))) (tail (gensym)))
(c2 (label-code label2 (compile-rx rx2)))) (let ((c1 (label-code label1
(cons (split-instr label1 label2) (append-instr rx1 (jmp-instr tail))))
(append c1 c2))))) (c2 (label-code label2 rx2)))
(cons (split-instr label1 label2)
(append-instr (append c1 c2) (label-instr tail))))))
(('opt rx) (define (opt rx)
(let ((head (gensym)) (let ((head (gensym))
(tail (gensym))) (tail (gensym)))
(cons (split-instr head tail) (cons (split-instr head tail)
(label-code head (append-instr (compile-rx rx) (label-code head
(label-instr tail)))))) (append-instr rx (label-instr tail))))))
(('star rx) (define (star rx)
(let ((head (gensym)) (let ((head (gensym))
(tail (gensym))) (tail (gensym)))
(cons (split-instr head tail) (cons (split-instr head tail)
(label-code head (append-instr (compile-rx rx) (label-code head
(jmp-instr head) (append-instr rx
(label-instr tail)))))) (jmp-instr head)
(label-instr tail))))))
(('plus rx)
(let ((head (gensym)) (define (plus rx)
(tail (gensym))) (let ((head (gensym))
(label-code head (tail (gensym)))
(append-instr (compile-rx rx) (label-code head
(split-instr head tail) (append-instr rx
(label-instr tail))))))) (split-instr head tail)
(label-instr tail)))))
(define (label-locations code)
(let ((locs (make-eq-hashtable)))
(let loop ((pc 0)
(code code))
(if (null? code)
locs
(match (car code)
(('label l)
(begin
(hashtable-set! locs l pc)
(loop pc (cdr code))))
(instr
(loop (+ 1 pc) (cdr code))))))))
(define (remove-labels code locs)
(let loop ((pc 0)
(code code)
(acc '()))
(if (null? code)
(reverse acc)
(match (car code)
(('label l)
(loop pc (cdr code) acc))
(('jmp l)
(loop (+ 1 pc) (cdr code)
(cons `(jmp ,(hashtable-ref locs l #f)) acc)))
(('split l1 l2)
(loop (+ 1 pc) (cdr code)
(cons `(split ,(hashtable-ref locs l1 #f)
,(hashtable-ref locs l2 #f))
acc)))
(instr (loop (+ 1 pc) (cdr code) (cons instr acc)))))))
(define (optimise-jumps! code)
(upto (n (vector-length code))
(match (vector-ref code n)
(('jmp l)
(when (match-instr? (vector-ref code l))
(vector-set! code n (match-instr))))
(('split l1 l2)
(when (or (match-instr? (vector-ref code l1))
(match-instr? (vector-ref code l2)))
(vector-set! code n (match-instr))))
(_ _)))
code)
(define (compile-rx rx)
(let ((rx (append-instr rx (match-instr))))
(optimise-jumps!
(list->vector
(remove-labels rx (label-locations rx))))))
;; A 'thread' consists of an index into the instructions. A 'bundle' holds ;; A 'thread' consists of an index into the instructions. A 'bundle' holds
;; the current threads. Note there cannot be more threads than instructions, ;; the current threads. Note there cannot be more threads than instructions,