code.H3RALD.com — min: 1de1f3d51a6537dc7d391d86a12a67c16bd4fec7

Implemented support for regular expressions via slre.

h3rald h3rald@h3rald.com

Sun, 14 Dec 2014 14:46:12 +0100

commit

1de1f3d51a6537dc7d391d86a12a67c16bd4fec7

parent

a410d018e5c1c59ba5d1f1a4057cee0f0948f522

11 files changed, 923 insertions(+), 997 deletions(-)

jump to

lib/prelude.min

lib/regex.nim

minim.nim

vendor/slre.nim

vendor/slre/libslre.c

vendor/slre/slre.h

M lib/prelude.min → lib/prelude.min

@@ -20,6 +20,7 @@ [unquote]     [i]           :
 [unquote]     [apply]       :
 [filter]      [select]      :
 [clear]       [empty]       :  
+[match]       [~]           :
 
 // Common Environment Variables
 [os "windows" ==]

A lib/regex.nim

@@ -0,0 +1,36 @@ 
+import tables
+import ../core/parser, ../core/interpreter, ../core/utils
+import ../vendor/slre
+
+# Symbol "match": pops the regular expression first and the subject string
+# second, runs `match` on them and pushes one value holding every string it
+# returned. If either popped value is not a string, an errIncorrect error is
+# reported instead.
+minsym "match":
+  let pattern = i.pop
+  let subject = i.pop
+  if not (subject.isString and pattern.isString):
+    i.error(errIncorrect, "Two strings are required on the stack")
+  else:
+    let found = subject.strVal.match(pattern.strVal)
+    # Wrap each returned string before collecting it.
+    var wrapped: seq[TMinValue] = @[]
+    for text in found:
+      wrapped.add text.newVal
+    i.push wrapped.newVal
+
+# Symbol "match?": pops the regular expression first and the subject string
+# second, then pushes true when `match` returned at least one string and
+# false otherwise. Non-string operands are reported as errIncorrect.
+minsym "match?":
+  let pattern = i.pop
+  let subject = i.pop
+  if not (subject.isString and pattern.isString):
+    i.error(errIncorrect, "Two strings are required on the stack")
+  else:
+    let found = subject.strVal.match(pattern.strVal)
+    let matched = found.len > 0
+    i.push matched.newVal
+
+# Symbol "gsub": pops, in this order, the replacement string, the regular
+# expression and the subject string, then pushes the result of calling `gsub`
+# on them. All three operands must be strings; otherwise errIncorrect is
+# reported.
+minsym "gsub":
+  let replacement = i.pop
+  let pattern = i.pop
+  let subject = i.pop
+  let allStrings = pattern.isString and replacement.isString and subject.isString
+  if not allStrings:
+    i.error(errIncorrect, "Three strings are required on the stack")
+  else:
+    let rewritten = subject.strVal.gsub(pattern.strVal, replacement.strVal)
+    i.push rewritten.newVal

M minim.nim → minim.nim

@@ -10,7 +10,8 @@ lib/numbers,
   lib/logic,
   lib/time, 
   lib/io,
-  lib/sys
+  lib/sys,
+  lib/regex
 
 const version* = "0.1.0"
 var debugging = false

D vendor/T-Rex/history.txt

@@ -1,15 +0,0 @@ 
-===version 1.3
--fixed a bug for GCC users(thx Brendan)
-
-===version 1.2
--added word boundary match \b and \B
--added vertical tab escape \v
--\w now also matches '_' (underscore)
--fixed greediness for * and +
-
-===version 1.1 , April 1, 2004
--fixed some minor bug
--added predefined character classes(\w,\W,\s,\S etc...)
-
-===version 1.0 , February 23, 2004
--first public realase

D vendor/T-Rex/libtrex.c

@@ -1,652 +0,0 @@ 
-/* see copyright notice in trex.h */
-#include <string.h>
-#include <stdlib.h>
-#include <ctype.h>
-#include <setjmp.h>
-#include "trex.h"
-
-#ifdef _UINCODE
-#define scisprint iswprint
-#define scstrlen wcslen
-#define scprintf wprintf
-#define _SC(x) L(x)
-#else
-#define scisprint isprint
-#define scstrlen strlen
-#define scprintf printf
-#define _SC(x) (x)
-#endif
-
-#ifdef _DEBUG
-#include <stdio.h>
-
-static const TRexChar *g_nnames[] =
-{
-	_SC("NONE"),_SC("OP_GREEDY"),	_SC("OP_OR"),
-	_SC("OP_EXPR"),_SC("OP_NOCAPEXPR"),_SC("OP_DOT"),	_SC("OP_CLASS"),
-	_SC("OP_CCLASS"),_SC("OP_NCLASS"),_SC("OP_RANGE"),_SC("OP_CHAR"),
-	_SC("OP_EOL"),_SC("OP_BOL"),_SC("OP_WB")
-};
-
-#endif
-#define OP_GREEDY		(MAX_CHAR+1) // * + ? {n}
-#define OP_OR			(MAX_CHAR+2)
-#define OP_EXPR			(MAX_CHAR+3) //parentesis ()
-#define OP_NOCAPEXPR	(MAX_CHAR+4) //parentesis (?:)
-#define OP_DOT			(MAX_CHAR+5)
-#define OP_CLASS		(MAX_CHAR+6)
-#define OP_CCLASS		(MAX_CHAR+7)
-#define OP_NCLASS		(MAX_CHAR+8) //negates class the [^
-#define OP_RANGE		(MAX_CHAR+9)
-#define OP_CHAR			(MAX_CHAR+10)
-#define OP_EOL			(MAX_CHAR+11)
-#define OP_BOL			(MAX_CHAR+12)
-#define OP_WB			(MAX_CHAR+13)
-
-#define TREX_SYMBOL_ANY_CHAR ('.')
-#define TREX_SYMBOL_GREEDY_ONE_OR_MORE ('+')
-#define TREX_SYMBOL_GREEDY_ZERO_OR_MORE ('*')
-#define TREX_SYMBOL_GREEDY_ZERO_OR_ONE ('?')
-#define TREX_SYMBOL_BRANCH ('|')
-#define TREX_SYMBOL_END_OF_STRING ('$')
-#define TREX_SYMBOL_BEGINNING_OF_STRING ('^')
-#define TREX_SYMBOL_ESCAPE_CHAR ('\\')
-
-
-typedef int TRexNodeType;
-
-typedef struct tagTRexNode{
-	TRexNodeType type;
-	int left;
-	int right;
-	int next;
-}TRexNode;
-
-struct TRex{
-	const TRexChar *_eol;
-	const TRexChar *_bol;
-	const TRexChar *_p;
-	int _first;
-	int _op;
-	TRexNode *_nodes;
-	int _nallocated;
-	int _nsize;
-	int _nsubexpr;
-	TRexMatch *_matches;
-	int _currsubexp;
-	void *_jmpbuf;
-	const TRexChar **_error;
-};
-
-static int trex_list(TRex *exp);
-
-static int trex_newnode(TRex *exp, TRexNodeType type)
-{
-	TRexNode n;
-	int newid;
-	n.type = type;
-	n.next = n.right = n.left = -1;
-	if(type == OP_EXPR)
-		n.right = exp->_nsubexpr++;
-	if(exp->_nallocated < (exp->_nsize + 1)) {
-		int oldsize = exp->_nallocated;
-		exp->_nallocated *= 2;
-		exp->_nodes = (TRexNode *)realloc(exp->_nodes, exp->_nallocated * sizeof(TRexNode));
-	}
-	exp->_nodes[exp->_nsize++] = n;
-	newid = exp->_nsize - 1;
-	return (int)newid;
-}
-
-static void trex_error(TRex *exp,const TRexChar *error)
-{
-	if(exp->_error) *exp->_error = error;
-	longjmp(*((jmp_buf*)exp->_jmpbuf),-1);
-}
-
-static void trex_expect(TRex *exp, int n){
-	if((*exp->_p) != n)
-		trex_error(exp, _SC("expected paren"));
-	exp->_p++;
-}
-
-static TRexChar trex_escapechar(TRex *exp)
-{
-	if(*exp->_p == TREX_SYMBOL_ESCAPE_CHAR){
-		exp->_p++;
-		switch(*exp->_p) {
-		case 'v': exp->_p++; return '\v';
-		case 'n': exp->_p++; return '\n';
-		case 't': exp->_p++; return '\t';
-		case 'r': exp->_p++; return '\r';
-		case 'f': exp->_p++; return '\f';
-		default: return (*exp->_p++);
-		}
-	} else if(!scisprint(*exp->_p)) trex_error(exp,_SC("letter expected"));
-	return (*exp->_p++);
-}
-
-static int trex_charclass(TRex *exp,int classid)
-{
-	int n = trex_newnode(exp,OP_CCLASS);
-	exp->_nodes[n].left = classid;
-	return n;
-}
-
-static int trex_charnode(TRex *exp,TRexBool isclass)
-{
-	TRexChar t;
-	if(*exp->_p == TREX_SYMBOL_ESCAPE_CHAR) {
-		exp->_p++;
-		switch(*exp->_p) {
-			case 'n': exp->_p++; return trex_newnode(exp,'\n');
-			case 't': exp->_p++; return trex_newnode(exp,'\t');
-			case 'r': exp->_p++; return trex_newnode(exp,'\r');
-			case 'f': exp->_p++; return trex_newnode(exp,'\f');
-			case 'v': exp->_p++; return trex_newnode(exp,'\v');
-			case 'a': case 'A': case 'w': case 'W': case 's': case 'S':
-			case 'd': case 'D': case 'x': case 'X': case 'c': case 'C':
-			case 'p': case 'P': case 'l': case 'u':
-				{
-				t = *exp->_p; exp->_p++;
-				return trex_charclass(exp,t);
-				}
-			case 'b':
-			case 'B':
-				if(!isclass) {
-					int node = trex_newnode(exp,OP_WB);
-					exp->_nodes[node].left = *exp->_p;
-					exp->_p++;
-					return node;
-				} //else default
-			default:
-				t = *exp->_p; exp->_p++;
-				return trex_newnode(exp,t);
-		}
-	}
-	else if(!scisprint(*exp->_p)) {
-
-		trex_error(exp,_SC("letter expected"));
-	}
-	t = *exp->_p; exp->_p++;
-	return trex_newnode(exp,t);
-}
-static int trex_class(TRex *exp)
-{
-	int ret = -1;
-	int first = -1,chain;
-	if(*exp->_p == TREX_SYMBOL_BEGINNING_OF_STRING){
-		ret = trex_newnode(exp,OP_NCLASS);
-		exp->_p++;
-	}else ret = trex_newnode(exp,OP_CLASS);
-
-	if(*exp->_p == ']') trex_error(exp,_SC("empty class"));
-	chain = ret;
-	while(*exp->_p != ']' && exp->_p != exp->_eol) {
-		if(*exp->_p == '-' && first != -1){
-			int r,t;
-			if(*exp->_p++ == ']') trex_error(exp,_SC("unfinished range"));
-			r = trex_newnode(exp,OP_RANGE);
-			if(first>*exp->_p) trex_error(exp,_SC("invalid range"));
-			if(exp->_nodes[first].type == OP_CCLASS) trex_error(exp,_SC("cannot use character classes in ranges"));
-			exp->_nodes[r].left = exp->_nodes[first].type;
-			t = trex_escapechar(exp);
-			exp->_nodes[r].right = t;
-            exp->_nodes[chain].next = r;
-			chain = r;
-			first = -1;
-		}
-		else{
-			if(first!=-1){
-				int c = first;
-				exp->_nodes[chain].next = c;
-				chain = c;
-				first = trex_charnode(exp,TRex_True);
-			}
-			else{
-				first = trex_charnode(exp,TRex_True);
-			}
-		}
-	}
-	if(first!=-1){
-		int c = first;
-		exp->_nodes[chain].next = c;
-		chain = c;
-		first = -1;
-	}
-	/* hack? */
-	exp->_nodes[ret].left = exp->_nodes[ret].next;
-	exp->_nodes[ret].next = -1;
-	return ret;
-}
-
-static int trex_parsenumber(TRex *exp)
-{
-	int ret = *exp->_p-'0';
-	int positions = 10;
-	exp->_p++;
-	while(isdigit(*exp->_p)) {
-		ret = ret*10+(*exp->_p++-'0');
-		if(positions==1000000000) trex_error(exp,_SC("overflow in numeric constant"));
-		positions *= 10;
-	};
-	return ret;
-}
-
-static int trex_element(TRex *exp)
-{
-	int ret = -1;
-	switch(*exp->_p)
-	{
-	case '(': {
-		int expr,newn;
-		exp->_p++;
-
-
-		if(*exp->_p =='?') {
-			exp->_p++;
-			trex_expect(exp,':');
-			expr = trex_newnode(exp,OP_NOCAPEXPR);
-		}
-		else
-			expr = trex_newnode(exp,OP_EXPR);
-		newn = trex_list(exp);
-		exp->_nodes[expr].left = newn;
-		ret = expr;
-		trex_expect(exp,')');
-			  }
-			  break;
-	case '[':
-		exp->_p++;
-		ret = trex_class(exp);
-		trex_expect(exp,']');
-		break;
-	case TREX_SYMBOL_END_OF_STRING: exp->_p++; ret = trex_newnode(exp,OP_EOL);break;
-	case TREX_SYMBOL_ANY_CHAR: exp->_p++; ret = trex_newnode(exp,OP_DOT);break;
-	default:
-		ret = trex_charnode(exp,TRex_False);
-		break;
-	}
-
-	{
-		int op;
-		TRexBool isgreedy = TRex_False;
-		unsigned short p0 = 0, p1 = 0;
-		switch(*exp->_p){
-			case TREX_SYMBOL_GREEDY_ZERO_OR_MORE: p0 = 0; p1 = 0xFFFF; exp->_p++; isgreedy = TRex_True; break;
-			case TREX_SYMBOL_GREEDY_ONE_OR_MORE: p0 = 1; p1 = 0xFFFF; exp->_p++; isgreedy = TRex_True; break;
-			case TREX_SYMBOL_GREEDY_ZERO_OR_ONE: p0 = 0; p1 = 1; exp->_p++; isgreedy = TRex_True; break;
-			case '{':
-				exp->_p++;
-				if(!isdigit(*exp->_p)) trex_error(exp,_SC("number expected"));
-				p0 = (unsigned short)trex_parsenumber(exp);
-				/*******************************/
-				switch(*exp->_p) {
-			case '}':
-				p1 = p0; exp->_p++;
-				break;
-			case ',':
-				exp->_p++;
-				p1 = 0xFFFF;
-				if(isdigit(*exp->_p)){
-					p1 = (unsigned short)trex_parsenumber(exp);
-				}
-				trex_expect(exp,'}');
-				break;
-			default:
-				trex_error(exp,_SC(", or } expected"));
-		}
-		/*******************************/
-		isgreedy = TRex_True;
-		break;
-
-		}
-		if(isgreedy) {
-			int nnode = trex_newnode(exp,OP_GREEDY);
-			op = OP_GREEDY;
-			exp->_nodes[nnode].left = ret;
-			exp->_nodes[nnode].right = ((p0)<<16)|p1;
-			ret = nnode;
-		}
-	}
-	if((*exp->_p != TREX_SYMBOL_BRANCH) && (*exp->_p != ')') && (*exp->_p != TREX_SYMBOL_GREEDY_ZERO_OR_MORE) && (*exp->_p != TREX_SYMBOL_GREEDY_ONE_OR_MORE) && (*exp->_p != '\0')) {
-		int nnode = trex_element(exp);
-		exp->_nodes[ret].next = nnode;
-	}
-
-	return ret;
-}
-
-static int trex_list(TRex *exp)
-{
-	int ret=-1,e;
-	if(*exp->_p == TREX_SYMBOL_BEGINNING_OF_STRING) {
-		exp->_p++;
-		ret = trex_newnode(exp,OP_BOL);
-	}
-	e = trex_element(exp);
-	if(ret != -1) {
-		exp->_nodes[ret].next = e;
-	}
-	else ret = e;
-
-	if(*exp->_p == TREX_SYMBOL_BRANCH) {
-		int temp,tright;
-		exp->_p++;
-		temp = trex_newnode(exp,OP_OR);
-		exp->_nodes[temp].left = ret;
-		tright = trex_list(exp);
-		exp->_nodes[temp].right = tright;
-		ret = temp;
-	}
-	return ret;
-}
-
-static TRexBool trex_matchcclass(int cclass,TRexChar c)
-{
-	switch(cclass) {
-	case 'a': return isalpha(c)?TRex_True:TRex_False;
-	case 'A': return !isalpha(c)?TRex_True:TRex_False;
-	case 'w': return (isalnum(c) || c == '_')?TRex_True:TRex_False;
-	case 'W': return (!isalnum(c) && c != '_')?TRex_True:TRex_False;
-	case 's': return isspace(c)?TRex_True:TRex_False;
-	case 'S': return !isspace(c)?TRex_True:TRex_False;
-	case 'd': return isdigit(c)?TRex_True:TRex_False;
-	case 'D': return !isdigit(c)?TRex_True:TRex_False;
-	case 'x': return isxdigit(c)?TRex_True:TRex_False;
-	case 'X': return !isxdigit(c)?TRex_True:TRex_False;
-	case 'c': return iscntrl(c)?TRex_True:TRex_False;
-	case 'C': return !iscntrl(c)?TRex_True:TRex_False;
-	case 'p': return ispunct(c)?TRex_True:TRex_False;
-	case 'P': return !ispunct(c)?TRex_True:TRex_False;
-	case 'l': return islower(c)?TRex_True:TRex_False;
-	case 'u': return isupper(c)?TRex_True:TRex_False;
-	}
-	return TRex_False; /*cannot happen*/
-}
-
-static TRexBool trex_matchclass(TRex* exp,TRexNode *node,TRexChar c)
-{
-	do {
-		switch(node->type) {
-			case OP_RANGE:
-				if(c >= node->left && c <= node->right) return TRex_True;
-				break;
-			case OP_CCLASS:
-				if(trex_matchcclass(node->left,c)) return TRex_True;
-				break;
-			default:
-				if(c == node->type)return TRex_True;
-		}
-	} while((node->next != -1) && (node = &exp->_nodes[node->next]));
-	return TRex_False;
-}
-
-static const TRexChar *trex_matchnode(TRex* exp,TRexNode *node,const TRexChar *str,TRexNode *next)
-{
-
-	TRexNodeType type = node->type;
-	switch(type) {
-	case OP_GREEDY: {
-		//TRexNode *greedystop = (node->next != -1) ? &exp->_nodes[node->next] : NULL;
-		TRexNode *greedystop = NULL;
-		int p0 = (node->right >> 16)&0x0000FFFF, p1 = node->right&0x0000FFFF, nmaches = 0;
-		const TRexChar *s=str, *good = str;
-
-		if(node->next != -1) {
-			greedystop = &exp->_nodes[node->next];
-		}
-		else {
-			greedystop = next;
-		}
-
-		while((nmaches == 0xFFFF || nmaches < p1)) {
-
-			const TRexChar *stop;
-			if(!(s = trex_matchnode(exp,&exp->_nodes[node->left],s,greedystop)))
-				break;
-			nmaches++;
-			good=s;
-			if(greedystop) {
-				//checks that 0 matches satisfy the expression(if so skips)
-				//if not would always stop(for instance if is a '?')
-				if(greedystop->type != OP_GREEDY ||
-				(greedystop->type == OP_GREEDY && ((greedystop->right >> 16)&0x0000FFFF) != 0))
-				{
-					TRexNode *gnext = NULL;
-					if(greedystop->next != -1) {
-						gnext = &exp->_nodes[greedystop->next];
-					}else if(next && next->next != -1){
-						gnext = &exp->_nodes[next->next];
-					}
-					stop = trex_matchnode(exp,greedystop,s,gnext);
-					if(stop) {
-						//if satisfied stop it
-						if(p0 == p1 && p0 == nmaches) break;
-						else if(nmaches >= p0 && p1 == 0xFFFF) break;
-						else if(nmaches >= p0 && nmaches <= p1) break;
-					}
-				}
-			}
-
-			if(s >= exp->_eol)
-				break;
-		}
-		if(p0 == p1 && p0 == nmaches) return good;
-		else if(nmaches >= p0 && p1 == 0xFFFF) return good;
-		else if(nmaches >= p0 && nmaches <= p1) return good;
-		return NULL;
-	}
-	case OP_OR: {
-			const TRexChar *asd = str;
-			TRexNode *temp=&exp->_nodes[node->left];
-			while( (asd = trex_matchnode(exp,temp,asd,NULL)) ) {
-				if(temp->next != -1)
-					temp = &exp->_nodes[temp->next];
-				else
-					return asd;
-			}
-			asd = str;
-			temp = &exp->_nodes[node->right];
-			while( (asd = trex_matchnode(exp,temp,asd,NULL)) ) {
-				if(temp->next != -1)
-					temp = &exp->_nodes[temp->next];
-				else
-					return asd;
-			}
-			return NULL;
-			break;
-	}
-	case OP_EXPR:
-	case OP_NOCAPEXPR:{
-			TRexNode *n = &exp->_nodes[node->left];
-			const TRexChar *cur = str;
-			int capture = -1;
-			if(node->type != OP_NOCAPEXPR && node->right == exp->_currsubexp) {
-				capture = exp->_currsubexp;
-				exp->_matches[capture].begin = cur;
-				exp->_currsubexp++;
-			}
-
-			do {
-				TRexNode *subnext = NULL;
-				if(n->next != -1) {
-					subnext = &exp->_nodes[n->next];
-				}else {
-					subnext = next;
-				}
-				if(!(cur = trex_matchnode(exp,n,cur,subnext))) {
-					if(capture != -1){
-						exp->_matches[capture].begin = 0;
-						exp->_matches[capture].len = 0;
-					}
-					return NULL;
-				}
-			} while((n->next != -1) && (n = &exp->_nodes[n->next]));
-
-			if(capture != -1)
-				exp->_matches[capture].len = cur - exp->_matches[capture].begin;
-			return cur;
-	}
-	case OP_WB:
-		if(str == exp->_bol && !isspace(*str)
-		 || (str == exp->_eol && !isspace(*(str-1)))
-		 || (!isspace(*str) && isspace(*(str+1)))
-		 || (isspace(*str) && !isspace(*(str+1))) ) {
-			return (node->left == 'b')?str:NULL;
-		}
-		return (node->left == 'b')?NULL:str;
-	case OP_BOL:
-		if(str == exp->_bol) return str;
-		return NULL;
-	case OP_EOL:
-		if(str == exp->_eol) return str;
-		return NULL;
-	case OP_DOT:{
-		*str++;
-				}
-		return str;
-	case OP_NCLASS:
-	case OP_CLASS:
-		if(trex_matchclass(exp,&exp->_nodes[node->left],*str)?(type == OP_CLASS?TRex_True:TRex_False):(type == OP_NCLASS?TRex_True:TRex_False)) {
-			*str++;
-			return str;
-		}
-		return NULL;
-	case OP_CCLASS:
-		if(trex_matchcclass(node->left,*str)) {
-			*str++;
-			return str;
-		}
-		return NULL;
-	default: /* char */
-		if(*str != node->type) return NULL;
-		*str++;
-		return str;
-	}
-	return NULL;
-}
-
-/* public api */
-TRex *trex_compile(const TRexChar *pattern,const TRexChar **error)
-{
-	TRex *exp = (TRex *)malloc(sizeof(TRex));
-	exp->_eol = exp->_bol = NULL;
-	exp->_p = pattern;
-	exp->_nallocated = (int)scstrlen(pattern) * sizeof(TRexChar);
-	exp->_nodes = (TRexNode *)malloc(exp->_nallocated * sizeof(TRexNode));
-	exp->_nsize = 0;
-	exp->_matches = 0;
-	exp->_nsubexpr = 0;
-	exp->_first = trex_newnode(exp,OP_EXPR);
-	exp->_error = error;
-	exp->_jmpbuf = malloc(sizeof(jmp_buf));
-	if(setjmp(*((jmp_buf*)exp->_jmpbuf)) == 0) {
-		int res = trex_list(exp);
-		exp->_nodes[exp->_first].left = res;
-		if(*exp->_p!='\0')
-			trex_error(exp,_SC("unexpected character"));
-#ifdef _DEBUG
-		{
-			int nsize,i;
-			TRexNode *t;
-			nsize = exp->_nsize;
-			t = &exp->_nodes[0];
-			scprintf(_SC("\n"));
-			for(i = 0;i < nsize; i++) {
-				if(exp->_nodes[i].type>MAX_CHAR)
-					scprintf(_SC("[%02d] %10s "),i,g_nnames[exp->_nodes[i].type-MAX_CHAR]);
-				else
-					scprintf(_SC("[%02d] %10c "),i,exp->_nodes[i].type);
-				scprintf(_SC("left %02d right %02d next %02d\n"),exp->_nodes[i].left,exp->_nodes[i].right,exp->_nodes[i].next);
-			}
-			scprintf(_SC("\n"));
-		}
-#endif
-		exp->_matches = (TRexMatch *) malloc(exp->_nsubexpr * sizeof(TRexMatch));
-		memset(exp->_matches,0,exp->_nsubexpr * sizeof(TRexMatch));
-	}
-	else{
-		trex_free(exp);
-		return NULL;
-	}
-	return exp;
-}
-
-void trex_free(TRex *exp)
-{
-	if(exp)	{
-		if(exp->_nodes) free(exp->_nodes);
-		if(exp->_jmpbuf) free(exp->_jmpbuf);
-		if(exp->_matches) free(exp->_matches);
-		free(exp);
-	}
-}
-
-TRexBool trex_match(TRex* exp,const TRexChar* text)
-{
-	const TRexChar* res = NULL;
-	exp->_bol = text;
-	exp->_eol = text + scstrlen(text);
-	exp->_currsubexp = 0;
-	res = trex_matchnode(exp,exp->_nodes,text,NULL);
-
-	#ifdef _DEBUG
-		scprintf("DEBUG trex_match: res = '%s'\n", res);
-		scprintf("DEBUG trex_match: exp->_eol = '%s'\n", exp->_eol);
-	#endif
-
-	// Fail match if trex_matchnode returns nothing
-	if (!res) {
-		return TRex_False;
-	}
-
-	return TRex_True;
-}
-
-TRexBool trex_searchrange(TRex* exp,const TRexChar* text_begin,const TRexChar* text_end,const TRexChar** out_begin, const TRexChar** out_end)
-{
-	const TRexChar *cur = NULL;
-	int node = exp->_first;
-	if(text_begin >= text_end) return TRex_False;
-	exp->_bol = text_begin;
-	exp->_eol = text_end;
-	do {
-		cur = text_begin;
-		while(node != -1) {
-			exp->_currsubexp = 0;
-			cur = trex_matchnode(exp,&exp->_nodes[node],cur,NULL);
-			if(!cur)
-				break;
-			node = exp->_nodes[node].next;
-		}
-		*text_begin++;
-	} while(cur == NULL && text_begin != text_end);
-
-	if(cur == NULL)
-		return TRex_False;
-
-	--text_begin;
-
-	if(out_begin) *out_begin = text_begin;
-	if(out_end) *out_end = cur;
-	return TRex_True;
-}
-
-TRexBool trex_search(TRex* exp,const TRexChar* text, const TRexChar** out_begin, const TRexChar** out_end)
-{
-	return trex_searchrange(exp,text,text + scstrlen(text),out_begin,out_end);
-}
-
-int trex_getsubexpcount(TRex* exp)
-{
-	return exp->_nsubexpr;
-}
-
-TRexBool trex_getsubexp(TRex* exp, int n, TRexMatch *subexp)
-{
-	if( n<0 || n >= exp->_nsubexpr) return TRex_False;
-	*subexp = exp->_matches[n];
-	return TRex_True;
-}
-

D vendor/T-Rex/readme.txt

@@ -1,171 +0,0 @@ 
-T-REX 1.3 http://tiny-rex.sourceforge.net
-----------------------------------------------------------------------
-	T-Rex a tiny regular expression library
-
-	Copyright (C) 2003-2006 Alberto Demichelis
-
-	This software is provided 'as-is', without any express 
-	or implied warranty. In no event will the authors be held 
-	liable for any damages arising from the use of this software.
-
-	Permission is granted to anyone to use this software for 
-	any purpose, including commercial applications, and to alter
-	it and redistribute it freely, subject to the following restrictions:
-
-		1. The origin of this software must not be misrepresented;
-		you must not claim that you wrote the original software.
-		If you use this software in a product, an acknowledgment
-		in the product documentation would be appreciated but
-		is not required.
-
-		2. Altered source versions must be plainly marked as such,
-		and must not be misrepresented as being the original software.
-
-		3. This notice may not be removed or altered from any
-		source distribution.
-		
-----------------------------------------------------------------------
-TRex implements the following expressions
-
-\	Quote the next metacharacter
-^	Match the beginning of the string
-.	Match any character
-$	Match the end of the string
-|	Alternation
-()	Grouping (creates a capture)
-[]	Character class  
-
-==GREEDY CLOSURES==
-*	   Match 0 or more times
-+	   Match 1 or more times
-?	   Match 1 or 0 times
-{n}    Match exactly n times
-{n,}   Match at least n times
-{n,m}  Match at least n but not more than m times  
-
-==ESCAPE CHARACTERS==
-\t		tab                   (HT, TAB)
-\n		newline               (LF, NL)
-\r		return                (CR)
-\f		form feed             (FF)
-
-==PREDEFINED CLASSES==
-\l		lowercase next char
-\u		uppercase next char
-\a		letters
-\A		non letters
-\w		alphanimeric [0-9a-zA-Z]
-\W		non alphanimeric
-\s		space
-\S		non space
-\d		digits
-\D		non nondigits
-\x		exadecimal digits
-\X		non exadecimal digits
-\c		control charactrs
-\C		non control charactrs
-\p		punctation
-\P		non punctation
-\b		word boundary
-\B		non word boundary
-
-----------------------------------------------------------------------
-API DOC
-----------------------------------------------------------------------
-TRex *trex_compile(const TRexChar *pattern,const TRexChar **error);
-
-compiles an expression and returns a pointer to the compiled version.
-in case of failure returns NULL.The returned object has to be deleted
-through the function trex_free().
-
-pattern
-	a pointer to a zero terminated string containing the pattern that 
-	has to be compiled.
-error
-	apointer to a string pointer that will be set with an error string
-	in case of failure.
-	
-----------------------------------------------------------------------
-void trex_free(TRex *exp)
-
-deletes a expression structure created with trex_compile()
-
-exp
-	the expression structure that has to be deleted
-
-----------------------------------------------------------------------
-TRexBool trex_match(TRex* exp,const TRexChar* text)
-
-returns TRex_True if the string specified in the parameter text is an
-exact match of the expression, otherwise returns TRex_False.
-
-exp
-	the compiled expression
-text
-	the string that has to be tested
-	
-----------------------------------------------------------------------
-TRexBool trex_search(TRex* exp,const TRexChar* text, const TRexChar** out_begin, const TRexChar** out_end)
-
-searches the first match of the expressin in the string specified in the parameter text.
-if the match is found returns TRex_True and the sets out_begin to the beginning of the
-match and out_end at the end of the match; otherwise returns TRex_False.
-
-exp
-	the compiled expression
-text
-	the string that has to be tested
-out_begin
-	a pointer to a string pointer that will be set with the beginning of the match
-out_end
-	a pointer to a string pointer that will be set with the end of the match
-
-----------------------------------------------------------------------
-TREX_API TRexBool trex_searchrange(TRex* exp,const TRexChar* text_begin,const TRexChar* text_end,const TRexChar** out_begin, const TRexChar** out_end)
-
-searches the first match of the expressin in the string delimited 
-by the parameter text_begin and text_end.
-if the match is found returns TRex_True and the sets out_begin to the beginning of the
-match and out_end at the end of the match; otherwise returns TRex_False.
-
-exp
-	the compiled expression
-text_begin
-	a pointer to the beginnning of the string that has to be tested
-text_end
-	a pointer to the end of the string that has to be tested
-out_begin
-	a pointer to a string pointer that will be set with the beginning of the match
-out_end
-	a pointer to a string pointer that will be set with the end of the match
-	
-----------------------------------------------------------------------
-int trex_getsubexpcount(TRex* exp)
-
-returns the number of sub expressions matched by the expression
-
-exp
-	the compiled expression
-
----------------------------------------------------------------------
-TRexBool trex_getsubexp(TRex* exp, int n, TRexMatch *submatch)
-
-retrieve the begin and and pointer to the length of the sub expression indexed
-by n. The result is passed trhough the struct TRexMatch:
-
-typedef struct {
-	const TRexChar *begin;
-	int len;
-} TRexMatch;
-
-the function returns TRex_True if n is valid index otherwise TRex_False.
-
-exp
-	the compiled expression
-n
-	the index of the submatch
-submatch
-	a pointer to structure that will store the result
-	
-this function works also after a match operation has been performend.
-

D vendor/T-Rex/trex.h

@@ -1,67 +0,0 @@ 
-#ifndef _TREX_H_
-#define _TREX_H_
-/***************************************************************
-	T-Rex a tiny regular expression library
-
-	Copyright (C) 2003-2006 Alberto Demichelis
-
-	This software is provided 'as-is', without any express 
-	or implied warranty. In no event will the authors be held 
-	liable for any damages arising from the use of this software.
-
-	Permission is granted to anyone to use this software for 
-	any purpose, including commercial applications, and to alter
-	it and redistribute it freely, subject to the following restrictions:
-
-		1. The origin of this software must not be misrepresented;
-		you must not claim that you wrote the original software.
-		If you use this software in a product, an acknowledgment
-		in the product documentation would be appreciated but
-		is not required.
-
-		2. Altered source versions must be plainly marked as such,
-		and must not be misrepresented as being the original software.
-
-		3. This notice may not be removed or altered from any
-		source distribution.
-
-****************************************************************/
-
-//#ifdef _UNICODE
-//#define TRexChar unsigned short
-//#define MAX_CHAR 0xFFFF
-//#define _TREXC(c) L##c 
-//#define trex_strlen wcslen
-//#define trex_printf wprintf
-//#else
-#define TRexChar char
-#define MAX_CHAR 0xFF
-#define _TREXC(c) (c) 
-#define trex_strlen strlen
-#define trex_printf printf
-//#endif
-
-#ifndef extern
-#define extern extern
-#endif
-
-#define TRex_True 1
-#define TRex_False 0
-
-typedef unsigned int TRexBool;
-typedef struct TRex TRex;
-
-typedef struct {
-	const TRexChar *begin;
-	int len;
-} TRexMatch;
-
-extern TRex *trex_compile(const TRexChar *pattern,const TRexChar **error);
-extern void trex_free(TRex *exp);
-extern TRexBool trex_match(TRex* exp,const TRexChar* text);
-extern TRexBool trex_search(TRex* exp,const TRexChar* text, const TRexChar** out_begin, const TRexChar** out_end);
-extern TRexBool trex_searchrange(TRex* exp,const TRexChar* text_begin,const TRexChar* text_end,const TRexChar** out_begin, const TRexChar** out_end);
-extern int trex_getsubexpcount(TRex* exp);
-extern TRexBool trex_getsubexp(TRex* exp, int n, TRexMatch *subexp);
-
-#endif

A vendor/slre.nim

@@ -0,0 +1,125 @@ 
+#
+#  Copyright (c) 2004-2005 Sergey Lyubka <valenok@gmail.com>
+#  All rights reserved
+# 
+#  "THE BEER-WARE LICENSE" (Revision 42):
+#  Sergey Lyubka wrote this file.  As long as you retain this notice you
+#  can do whatever you want with this stuff. If we meet some day, and you think
+#  this stuff is worth it, you can buy me a beer in return.
+# 
+#
+#  This is a regular expression library that implements a subset of Perl RE.
+#  Please refer to http://slre.sourceforge.net for detailed description.
+# 
+#  Usage example (parsing HTTP request):
+# 
+#  struct slre	slre;
+#  struct cap	captures[4 + 1];  // Number of bracket pairs + 1
+#  ...
+# 
+#  slre_compile(&slre,"^(GET|POST) (\S+) HTTP/(\S+?)\r\n");
+# 
+#  if (slre_match(&slre, buf, len, captures)) {
+# 	printf("Request line length: %d\n", captures[0].len);
+# 	printf("Method: %.*s\n", captures[1].len, captures[1].ptr);
+# 	printf("URI: %.*s\n", captures[2].len, captures[2].ptr);
+#  }
+# 
+#  Supported syntax:
+# 	^		Match beginning of a buffer
+# 	$		Match end of a buffer
+# 	()		Grouping and substring capturing
+# 	[...]		Match any character from set
+# 	[^...]		Match any character but ones from set
+# 	\s		Match whitespace
+# 	\S		Match non-whitespace
+# 	\d		Match decimal digit
+# 	\r		Match carriage return
+# 	\n		Match newline
+# 	+		Match one or more times (greedy)
+# 	+?		Match one or more times (non-greedy)
+# 	*		Match zero or more times (greedy)
+# 	*?		Match zero or more times (non-greedy)
+# 	?		Match zero or once
+# 	\xDD		Match byte with hex value 0xDD
+# 	\meta		Match one of the meta character: ^$().[*+?\
+# 
+
+{.compile: "vendor/slre/libslre.c".}
+#
+#  Compiled regular expression
+#  Instances are handed by pointer to the imported C functions slre_compile
+#  and slre_match declared below.
+#  NOTE(review): the field order and sizes presumably have to mirror the C
+#  `struct slre` from vendor/slre/slre.h, which is not visible here; confirm
+#  before changing any field.
+# 
+type 
+  slre* = object 
+    code*: array[256, cuchar]
+    data*: array[256, cuchar]
+    code_size*: cint
+    data_size*: cint
+    num_caps*: cint         # Number of bracket pairs	
+    anchored*: cint         # Must match from string start	
+    err_str*: cstring       # Error string			
+  
+#
+#  Captured substring
+#  `match` below does not use `value` as-is: it converts it to a string and
+#  then keeps only the first `len` characters.
+# 
+type 
+  cap* = object 
+    value*: cstring           # Pointer to the substring	
+    len*: cint              # Substring length		
+
+#
+#  Compile regular expression. If success, 1 is returned.
+#  If error, 0 is returned and slre.err_str points to the error message. 
+#  This is only a binding (importc); it is not exported from this module and
+#  is used by `match` below.
+# 
+proc slre_compile(a2: ptr slre; re: cstring): cint {.importc.}
+#
+#  Return 1 if match, 0 if no match. 
+#  If `captured_substrings' array is not NULL, then it is filled with the
+#  values of captured substrings. captured_substrings[0] element is always
+#  a full matched substring. The round bracket captures start from
+#  captured_substrings[1].
+#  It is assumed that the size of captured_substrings array is enough to
+#  hold all captures. The caller function must make sure it is! So, the
+#  array_size = number_of_round_bracket_pairs + 1
+#
+#  This is only a binding (importc); it is not exported from this module.
+#  NOTE(review): `captured_substrings` is declared as openarray[cap];
+#  presumably Nim passes an openarray to C as a pointer plus a length, which
+#  would be one argument more than the C function described above expects.
+#  Confirm against the prototype in slre.h.
+# 
+proc slre_match(a2: ptr slre; buf: cstring; buf_len: cint; 
+                 captured_substrings: openarray[cap]): cint {.importc.}
+
+# High level API
+from strutils import contains, replace, parseInt
+from sequtils import delete
+
+proc match*(s: string, re: string): seq[string] =
+  ## Matches `s` against the regular expression `re`.
+  ##
+  ## Returns the captured substrings: the full match first, followed by the
+  ## round-bracket captures. Capture slots whose pointer is nil are skipped,
+  ## so they do not appear in the result. An empty seq means no match.
+  ##
+  ## Raises EInvalidValue when `re` does not compile (the message is the
+  ## error string reported by slre) or when `re` has more bracket pairs than
+  ## the fixed-size capture buffer can hold.
+  # The compiled expression lives on the stack: the previous alloc0'd copy
+  # was never released, leaking on every call (including the raise path).
+  var compiled: slre
+  if slre_compile(addr compiled, re) != 1:
+    raise newException(EInvalidValue, $(compiled.err_str))
+  var captures: array[10, cap]
+  # slre_match needs number_of_bracket_pairs + 1 slots and does not check the
+  # buffer size itself, so refuse patterns that would write past the array.
+  if int(compiled.num_caps) + 1 > captures.len:
+    raise newException(EInvalidValue,
+      "Too many capture groups in regular expression")
+  result = newSeq[string](0)
+  if slre_match(addr compiled, s.cstring, s.len.cint, captures) == 1:
+    for c in items(captures):
+      if c.value != nil:
+        # `value` points into the subject, so cut the text at the capture
+        # length.
+        let text = $(c.value)
+        result.add text.substr(0, c.len-1)
+
+proc gsub*(s_find: string, re: string, s_replace: string): string =
+  ## Replaces the text matched by `re` in `s_find` with `s_replace`.
+  ##
+  ## `re` is matched once against `s_find`; every occurrence of the matched
+  ## text is then replaced with `s_replace`. When `re` produced captures, the
+  ## first `$N` placeholder (N is a single digit) found in the resulting
+  ## string is substituted, everywhere it occurs, with capture N.
+  ## A placeholder whose N has no corresponding capture is left untouched.
+  ## Returns `s_find` unchanged when `re` does not match.
+  ##
+  ## Raises EInvalidValue (from `match`) when `re` does not compile.
+  ##
+  ## NOTE(review): only the first distinct placeholder found is expanded, and
+  ## it is searched for in the whole result rather than in `s_replace` alone;
+  ## both are kept as in the previous implementation.
+  let matches = s_find.match(re)
+  if matches.len == 0:
+    result = s_find
+  else:
+    result = s_find.replace(matches[0], s_replace)
+    if matches.len > 1:
+      # Replace captures
+      let caps = result.match("\\$(\\d)")
+      # caps[0] is the whole "$N" text; the captured digits start at index 1.
+      for k in 1 .. caps.len-1:
+        let c = caps[k]
+        let ci = parseInt(c)
+        # Skip placeholders that refer to a capture that does not exist
+        # instead of indexing out of range.
+        if ci < matches.len:
+          # `replace` already substitutes every occurrence; looping until the
+          # placeholder disappears would never end when the capture text
+          # itself contains the placeholder.
+          result = result.replace("$" & c, matches[ci])

A vendor/slre/libslre.c

@@ -0,0 +1,667 @@ 
+/*
+ * Copyright (c) 2004-2005 Sergey Lyubka <valenok@gmail.com>
+ * All rights reserved
+ *
+ * "THE BEER-WARE LICENSE" (Revision 42):
+ * Sergey Lyubka wrote this file.  As long as you retain this notice you
+ * can do whatever you want with this stuff. If we meet some day, and you think
+ * this stuff is worth it, you can buy me a beer in return.
+ */
+
+#include <stdio.h>
+#include <assert.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "slre.h"
+
+enum {END, BRANCH, ANY, EXACT, ANYOF, ANYBUT, OPEN, CLOSE, BOL, EOL,
+	STAR, PLUS, STARQ, PLUSQ, QUEST, SPACE, NONSPACE, DIGIT};
+
+static struct {
+	const char	*name;
+	int		narg;
+	const char	*flags;	
+} opcodes[] = {
+	{"END",		0, ""},		/* End of code block or program	*/
+	{"BRANCH",	2, "oo"},	/* Alternative operator, "|"	*/
+	{"ANY",		0, ""},		/* Match any character, "."	*/
+	{"EXACT",	2, "d"},	/* Match exact string		*/
+	{"ANYOF",	2, "D"},	/* Match any from set, "[]"	*/
+	{"ANYBUT",	2, "D"},	/* Match any but from set, "[^]"*/
+	{"OPEN ",	1, "i"},	/* Capture start, "("		*/
+	{"CLOSE",	1, "i"},	/* Capture end, ")"		*/
+	{"BOL",		0, ""},		/* Beginning of string, "^"	*/
+	{"EOL",		0, ""},		/* End of string, "$"		*/
+	{"STAR",	1, "o"},	/* Match zero or more times "*"	*/
+	{"PLUS",	1, "o"},	/* Match one or more times, "+"	*/
+	{"STARQ",	1, "o"},	/* Non-greedy STAR,  "*?"	*/
+	{"PLUSQ",	1, "o"},	/* Non-greedy PLUS, "+?"	*/
+	{"QUEST",	1, "o"},	/* Match zero or one time, "?"	*/
+	{"SPACE",	0, ""},		/* Match whitespace, "\s"	*/
+	{"NONSPACE",	0, ""},		/* Match non-space, "\S"	*/
+	{"DIGIT",	0, ""}		/* Match digit, "\d"		*/
+};
+
+/*
+ * Commands and operands are all unsigned char (1 byte long). All code offsets
+ * are relative to current address, and positive (always point forward). Data
+ * offsets are absolute. Commands with operands:
+ *
+ * BRANCH offset1 offset2
+ *	Try to match the code block that follows the BRANCH instruction
+ *	(code block ends with END). If no match, try to match code block that
+ *	starts at offset1. If either of these match, jump to offset2.
+ *
+ * EXACT data_offset data_length
+ *	Try to match exact string. String is recorded in data section from
+ *	data_offset, and has length data_length.
+ *
+ * OPEN capture_number
+ * CLOSE capture_number
+ *	If the user has passed a 'struct cap' array for captures, OPEN
+ *	records the beginning of the matched substring (cap->ptr), CLOSE
+ *	sets the length (cap->len) for respective capture_number.
+ *
+ * STAR code_offset
+ * PLUS code_offset
+ * QUEST code_offset
+ *	*, +, ?, respectively. Try to gobble as much as possible from the
+ *	matched buffer, until code block that follows these instructions
+ *	matches. When the longest possible string is matched,
+ *	jump to code_offset
+ *
+ * STARQ, PLUSQ are non-greedy versions of STAR and PLUS.
+ */
+
+static const char *meta_chars = "|.^$*+?()[\\";
+
+static void
+print_character_set(FILE *fp, const unsigned char *p, int len)
+{
+	int	i;
+
+	for (i = 0; i < len; i++) {
+		if (i > 0)
+			(void) fputc(',', fp);
+		if (p[i] == 0) {
+			i++;
+			if (p[i] == 0)
+				(void) fprintf(fp, "\\x%02x", p[i]);
+			else
+				(void) fprintf(fp, "%s", opcodes[p[i]].name);
+		} else if (isprint(p[i])) {
+			(void) fputc(p[i], fp);
+		} else {
+			(void) fprintf(fp,"\\x%02x", p[i]);
+		}
+	}
+}
+
+void
+slre_dump(const struct slre *r, FILE *fp)
+{
+	int	i, j, ch, op, pc;
+
+	for (pc = 0; pc < r->code_size; pc++) {
+
+		op = r->code[pc];
+		(void) fprintf(fp, "%3d %s ", pc, opcodes[op].name);
+
+		for (i = 0; opcodes[op].flags[i] != '\0'; i++)
+			switch (opcodes[op].flags[i]) {
+			case 'i':
+				(void) fprintf(fp, "%d ", r->code[pc + 1]);
+				pc++;
+				break;
+			case 'o':
+				(void) fprintf(fp, "%d ",
+				    pc + r->code[pc + 1] - i);
+				pc++;
+				break;
+			case 'D':
+				print_character_set(fp, r->data +
+				    r->code[pc + 1], r->code[pc + 2]);
+				pc += 2;
+				break;
+			case 'd':
+				(void) fputc('"', fp);
+				for (j = 0; j < r->code[pc + 2]; j++) {
+					ch = r->data[r->code[pc + 1] + j];
+					if (isprint(ch))
+						(void) fputc(ch, fp);
+					else
+						(void) fprintf(fp,"\\x%02x",ch);
+				}
+				(void) fputc('"', fp);
+				pc += 2;
+				break;
+			}
+
+		(void) fputc('\n', fp);
+	}
+}
+
+static void
+set_jump_offset(struct slre *r, int pc, int offset)
+{
+	assert(offset < r->code_size);
+
+	if (r->code_size - offset > 0xff) {
+		r->err_str = "Jump offset is too big";
+	} else {
+		r->code[pc] = (unsigned char) (r->code_size - offset);
+	}
+}
+
+static void
+emit(struct slre *r, int code)
+{
+	if (r->code_size >= (int) (sizeof(r->code) / sizeof(r->code[0])))
+		r->err_str = "RE is too long (code overflow)";
+	else
+		r->code[r->code_size++] = (unsigned char) code;
+}
+
+static void
+store_char_in_data(struct slre *r, int ch)
+{
+	if (r->data_size >= (int) sizeof(r->data))
+		r->err_str = "RE is too long (data overflow)";
+	else
+		r->data[r->data_size++] = ch;
+}
+
+static void
+exact(struct slre *r, const char **re)
+{
+	int	old_data_size = r->data_size;
+
+	while (**re != '\0' && (strchr(meta_chars, **re)) == NULL)
+		store_char_in_data(r, *(*re)++);
+
+	emit(r, EXACT);
+	emit(r, old_data_size);
+	emit(r, r->data_size - old_data_size);
+}
+
+/*
+ * Decode the escape sequence that follows a backslash. On entry *re points
+ * at the character after the backslash; on return it points past the
+ * sequence.
+ *
+ * Returns the literal byte value (0..255), or a class opcode (NONSPACE,
+ * SPACE, DIGIT) shifted left by 8 bits so callers can tell the two apart.
+ */
+static int
+get_escape_char(const char **re)
+{
+	int	res, hi, lo;
+
+	/*
+	 * A backslash at the very end of the expression stands for itself.
+	 * Do not step over the terminating NUL, or the callers would keep
+	 * reading past the end of the string.
+	 */
+	if (**re == '\0')
+		return ('\\');
+
+	switch (*(*re)++) {
+	case 'n':	res = '\n';		break;
+	case 'r':	res = '\r';		break;
+	case 't':	res = '\t';		break;
+	case '0':	res = 0;		break;
+	case 'S':	res = NONSPACE << 8;	break;
+	case 's':	res = SPACE << 8;	break;
+	case 'd':	res = DIGIT << 8;	break;
+	case 'x':
+		/* \xDD: byte with hex value DD; otherwise a literal 'x' */
+		hi = (unsigned char) (*re)[0];
+		lo = isxdigit(hi) ? (unsigned char) (*re)[1] : 0;
+		if (isxdigit(hi) && isxdigit(lo)) {
+			hi = isdigit(hi) ? hi - '0' : tolower(hi) - 'a' + 10;
+			lo = isdigit(lo) ? lo - '0' : tolower(lo) - 'a' + 10;
+			res = (hi << 4) | lo;
+			(*re) += 2;
+		} else {
+			res = 'x';
+		}
+		break;
+	default:
+		/*
+		 * Go through unsigned char: a negative plain char would set
+		 * the upper bits and be mistaken for a class opcode.
+		 */
+		res = (unsigned char) (*re)[-1];
+		break;
+	}
+
+	return (res);
+}
+
+static void
+anyof(struct slre *r, const char **re)
+{
+	int	esc, old_data_size = r->data_size, op = ANYOF;
+
+	if (**re == '^') {
+		op = ANYBUT;
+		(*re)++;
+	}
+
+	while (**re != '\0')
+
+		switch (*(*re)++) {
+		case ']':
+			emit(r, op);
+			emit(r, old_data_size);
+			emit(r, r->data_size - old_data_size);
+			return;
+			/* NOTREACHED */
+			break;
+		case '\\':
+			esc = get_escape_char(re);
+			if ((esc & 0xff) == 0) {
+				store_char_in_data(r, 0);
+				store_char_in_data(r, esc >> 8);
+			} else {
+				store_char_in_data(r, esc);
+			}
+			break;
+		default:
+			store_char_in_data(r, (*re)[-1]);
+			break;
+		}
+
+	r->err_str = "No closing ']' bracket";
+}
+
+/*
+ * Terminate the code emitted so far with END, then open a gap of `shift'
+ * bytes at code offset `begin' by moving everything from `begin' onwards.
+ * The caller fills the gap with an instruction and its operands.
+ * Sets r->err_str and leaves the code in place if the result would not fit.
+ */
+static void
+relocate(struct slre *r, int begin, int shift)
+{
+	emit(r, END);
+
+	/* emit() guards single bytes only; the shifted tail must fit as well */
+	if (r->code_size + shift > (int) sizeof(r->code)) {
+		r->err_str = "RE is too long (code overflow)";
+		return;
+	}
+
+	memmove(r->code + begin + shift, r->code + begin, r->code_size - begin);
+	r->code_size += shift;
+}
+
+static void
+quantifier(struct slre *r, int prev, int op)
+{
+	if (r->code[prev] == EXACT && r->code[prev + 2] > 1) {
+		r->code[prev + 2]--;
+		emit(r, EXACT);
+		emit(r, r->code[prev + 1] + r->code[prev + 2]);
+		emit(r, 1);
+		prev = r->code_size - 3;
+	}
+	relocate(r, prev, 2);
+	r->code[prev] = op;
+	set_jump_offset(r, prev + 1, prev);
+}
+
+static void
+exact_one_char(struct slre *r, int ch)
+{
+	emit(r, EXACT);
+	emit(r, r->data_size);
+	emit(r, 1);
+	store_char_in_data(r, ch);
+}
+
+static void
+fixup_branch(struct slre *r, int fixup)
+{
+	if (fixup > 0) {
+		emit(r, END);
+		set_jump_offset(r, fixup, fixup - 2);
+	}
+}
+
+/*
+ * Compile the expression at *re into r->code / r->data until the end of the
+ * string or a ')' is reached; *re is left pointing at that terminator.
+ * Calls itself recursively for the body of every "(...)" group.
+ * Errors are reported through r->err_str.
+ */
+static void
+compile(struct slre *r, const char **re)
+{
+	int	op, esc, branch_start, last_op, fixup, cap_no, level;
+
+	fixup = 0;
+	level = r->num_caps;
+	/* last_op tracks the start of the operand a quantifier applies to */
+	branch_start = last_op = r->code_size;
+
+	for (;;)
+		switch (*(*re)++) {
+		case '\0':
+			(*re)--;
+			return;
+			/* NOTREACHED */
+			break;
+		case '^':
+			emit(r, BOL);
+			break;
+		case '$':
+			emit(r, EOL);
+			break;
+		case '.':
+			last_op = r->code_size;
+			emit(r, ANY);
+			break;
+		case '[':
+			last_op = r->code_size;
+			anyof(r, re);
+			break;
+		case '\\':
+			last_op = r->code_size;
+			esc = get_escape_char(re);
+			if (esc & 0xff00) {
+				emit(r, esc >> 8);
+			} else {
+				exact_one_char(r, esc);
+			}
+			break;
+		case '(':
+			last_op = r->code_size;
+			cap_no = ++r->num_caps;
+			emit(r, OPEN);
+			emit(r, cap_no);
+
+			compile(r, re);
+			/*
+			 * Look before consuming: when the group is not closed
+			 * *re points at the terminating NUL and must stay
+			 * there, otherwise the callers read past the string.
+			 */
+			if (**re != ')') {
+				r->err_str = "No closing bracket";
+				return;
+			}
+			(*re)++;
+
+			emit(r, CLOSE);
+			emit(r, cap_no);
+			break;
+		case ')':
+			(*re)--;
+			fixup_branch(r, fixup);
+			if (level == 0) {
+				r->err_str = "Unbalanced brackets";
+				return;
+			}
+			return;
+			/* NOTREACHED */
+			break;
+		case '+':
+		case '*':
+			op = (*re)[-1] == '*' ? STAR: PLUS;
+			if (**re == '?') {
+				(*re)++;
+				op = op == STAR ? STARQ : PLUSQ;
+			}
+			quantifier(r, last_op, op);
+			break;
+		case '?':
+			quantifier(r, last_op, QUEST);
+			break;
+		case '|':
+			fixup_branch(r, fixup);
+			relocate(r, branch_start, 3);
+			r->code[branch_start] = BRANCH;
+			set_jump_offset(r, branch_start + 1, branch_start);
+			fixup = branch_start + 2;
+			/* Placeholder, patched later by fixup_branch() */
+			r->code[fixup] = 0xff;
+			break;
+		default:
+			(*re)--;
+			last_op = r->code_size;
+			exact(r, re);
+			break;
+		}
+}
+
+/*
+ * Compile regular expression `re' into `r'.
+ * Returns 1 on success; on error returns 0 and r->err_str points to the
+ * error message.
+ */
+int
+slre_compile(struct slre *r, const char *re)
+{
+	r->err_str = NULL;
+	r->code_size = r->data_size = r->num_caps = r->anchored = 0;
+
+	if (*re == '^')
+		r->anchored++;
+
+	emit(r, OPEN);	/* This will capture what matches full RE */
+	emit(r, 0);
+
+	/*
+	 * Stop at the first error. compile() returns without consuming an
+	 * unbalanced ')', so looping on it would never terminate, and after
+	 * an error `re' may no longer point inside the string. The error is
+	 * tested first so that `re' is not dereferenced in that case.
+	 */
+	while (r->err_str == NULL && *re != '\0')
+		compile(r, &re);
+
+	if (r->code[2] == BRANCH)
+		fixup_branch(r, 4);
+
+	emit(r, CLOSE);
+	emit(r, 0);
+	emit(r, END);
+
+	return (r->err_str == NULL ? 1 : 0);
+}
+
+static int match(const struct slre *, int,
+		const char *, int, int *, struct cap *);
+
+static void
+loop_greedy(const struct slre *r, int pc, const char *s, int len, int *ofs)
+{
+	int	saved_offset, matched_offset;
+
+	saved_offset = matched_offset = *ofs;
+
+	while (match(r, pc + 2, s, len, ofs, NULL)) {
+		saved_offset = *ofs;
+		if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL))
+			matched_offset = saved_offset;
+		*ofs = saved_offset;
+	}
+
+	*ofs = matched_offset;
+}
+
+static void
+loop_non_greedy(const struct slre *r, int pc, const char *s,int len, int *ofs)
+{
+	int	saved_offset = *ofs;
+
+	while (match(r, pc + 2, s, len, ofs, NULL)) {
+		saved_offset = *ofs;
+		if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL))
+			break;
+	}
+
+	*ofs = saved_offset;
+}
+
+/*
+ * Return 1 if byte `ch' belongs to the character set p[0..len-1] as stored
+ * by anyof(): a non-zero byte stands for itself, while a 0 byte introduces a
+ * two-byte entry whose second byte is 0 (literal NUL) or a class opcode
+ * (SPACE, NONSPACE, DIGIT).
+ */
+static int
+char_in_set(const unsigned char *p, int len, int ch)
+{
+	int	i;
+
+	for (i = 0; i < len; i++) {
+		if (p[i] != 0) {
+			if (p[i] == ch)
+				return (1);
+			continue;
+		}
+
+		if (++i >= len)
+			break;		/* Truncated entry, nothing to test */
+
+		switch (p[i]) {
+		case 0:
+			if (ch == 0)
+				return (1);
+			break;
+		case SPACE:
+			if (isspace(ch))
+				return (1);
+			break;
+		case NONSPACE:
+			if (!isspace(ch))
+				return (1);
+			break;
+		case DIGIT:
+			if (isdigit(ch))
+				return (1);
+			break;
+		default:
+			break;
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Match the byte at s[*ofs] against the set p[0..len-1] ("[...]").
+ * On a match advance *ofs by one and return 1, otherwise return 0.
+ * The caller guarantees that *ofs is inside the buffer.
+ */
+static int
+is_any_of(const unsigned char *p, int len, const char *s, int *ofs)
+{
+	/* Compare as unsigned char so that bytes >= 0x80 can match the set */
+	if (!char_in_set(p, len, (unsigned char) s[*ofs]))
+		return (0);
+
+	(*ofs)++;
+	return (1);
+}
+
+/*
+ * Match the byte at s[*ofs] against the negated set p[0..len-1] ("[^...]").
+ * If the byte is not in the set advance *ofs by one and return 1,
+ * otherwise return 0. The caller guarantees that *ofs is inside the buffer.
+ */
+static int
+is_any_but(const unsigned char *p, int len, const char *s, int *ofs)
+{
+	/* Compare as unsigned char so that bytes >= 0x80 can match the set */
+	if (char_in_set(p, len, (unsigned char) s[*ofs]))
+		return (0);
+
+	(*ofs)++;
+	return (1);
+}
+
+static int
+match(const struct slre *r, int pc, const char *s, int len,
+		int *ofs, struct cap *caps)
+{
+	int	n, saved_offset, res = 1;
+
+	while (res && r->code[pc] != END) {
+
+		assert(pc < r->code_size);
+		assert(pc < (int) (sizeof(r->code) / sizeof(r->code[0])));
+
+		switch (r->code[pc]) {
+		case BRANCH:
+			saved_offset = *ofs;
+			res = match(r, pc + 3, s, len, ofs, caps);
+			if (res == 0) {
+				*ofs = saved_offset;
+				res = match(r, pc + r->code[pc + 1],
+				    s, len, ofs, caps);
+			}
+			pc += r->code[pc + 2]; 
+			break;
+		case EXACT:
+			res = 0;
+			n = r->code[pc + 2];	/* String length */
+			if (n <= len - *ofs && !memcmp(s + *ofs, r->data +
+			    r->code[pc + 1], n)) {
+				(*ofs) += n;
+				res = 1;
+			}
+			pc += 3;
+			break;
+		case QUEST:
+			res = 1;
+			saved_offset = *ofs;
+			if (!match(r, pc + 2, s, len, ofs, caps))
+				*ofs = saved_offset;
+			pc += r->code[pc + 1];
+			break;
+		case STAR:
+			res = 1;
+			loop_greedy(r, pc, s, len, ofs);
+			pc += r->code[pc + 1];
+			break;
+		case STARQ:
+			res = 1;
+			loop_non_greedy(r, pc, s, len, ofs);
+			pc += r->code[pc + 1];
+			break;
+		case PLUS:
+			if ((res = match(r, pc + 2, s, len, ofs, caps)) == 0)
+				break;
+
+			loop_greedy(r, pc, s, len, ofs);
+			pc += r->code[pc + 1];
+			break;
+		case PLUSQ:
+			if ((res = match(r, pc + 2, s, len, ofs, caps)) == 0)
+				break;
+
+			loop_non_greedy(r, pc, s, len, ofs);
+			pc += r->code[pc + 1];
+			break;
+		case SPACE:
+			res = 0;
+			if (*ofs < len && isspace(((unsigned char *)s)[*ofs])) {
+				(*ofs)++;
+				res = 1;
+			}
+			pc++;
+			break;
+		case NONSPACE:
+			res = 0;
+			if (*ofs <len && !isspace(((unsigned char *)s)[*ofs])) {
+				(*ofs)++;
+				res = 1;
+			}
+			pc++;
+			break;
+		case DIGIT:
+			res = 0;
+			if (*ofs < len && isdigit(((unsigned char *)s)[*ofs])) {
+				(*ofs)++;
+				res = 1;
+			}
+			pc++;
+			break;
+		case ANY:
+			res = 0;
+			if (*ofs < len) {
+				(*ofs)++;
+				res = 1;
+			}
+			pc++;
+			break;
+		case ANYOF:
+			res = 0;
+			if (*ofs < len)
+				res = is_any_of(r->data + r->code[pc + 1],
+					r->code[pc + 2], s, ofs);
+			pc += 3;
+			break;
+		case ANYBUT:
+			res = 0;
+			if (*ofs < len)
+				res = is_any_but(r->data + r->code[pc + 1],
+					r->code[pc + 2], s, ofs);
+			pc += 3;
+			break;
+		case BOL:
+			res = *ofs == 0 ? 1 : 0;
+			pc++;
+			break;
+		case EOL:
+			res = *ofs == len ? 1 : 0;
+			pc++;
+			break;
+		case OPEN:
+			if (caps != NULL)
+				caps[r->code[pc + 1]].ptr = s + *ofs;
+			pc += 2;
+			break;
+		case CLOSE:
+			if (caps != NULL)
+				caps[r->code[pc + 1]].len = (s + *ofs) -
+				    caps[r->code[pc + 1]].ptr;
+			pc += 2;
+			break;
+		case END:
+			pc++;
+			break;
+		default:
+			printf("unknown cmd (%d) at %d\n", r->code[pc], pc);
+			assert(0);
+			break;
+		}
+	}
+
+	return (res);
+}
+
+/*
+ * Match compiled expression `r' against the first `len' bytes of `buf'.
+ * Returns 1 on match, 0 otherwise. If `caps' is not NULL it receives the
+ * captured substrings; caps[0] is the full match. The caller must provide
+ * room for number_of_bracket_pairs + 1 entries.
+ */
+int
+slre_match(const struct slre *r, const char *buf, int len,
+		struct cap *caps)
+{
+	int	i, ofs = 0, res = 0;
+
+	if (r->anchored) {
+		res = match(r, 0, buf, len, &ofs, caps);
+	} else {
+		/*
+		 * Try every start offset including `len' itself, so that
+		 * expressions able to match the empty string (e.g. "$" or
+		 * "a*") succeed at the end of the buffer and on empty input.
+		 */
+		for (i = 0; i <= len && res == 0; i++) {
+			ofs = i;
+			res = match(r, 0, buf, len, &ofs, caps);
+		}
+	}
+
+	return (res);
+}
+
+#ifdef TEST
+/*
+ * Test driver: compile the expression given in argv[1], dump its code, match
+ * it against the beginning of the file named in argv[2] (optionally repeated
+ * argv[3] times) and print the result together with the captured substrings.
+ */
+int main(int argc, char *argv[])
+{
+	struct slre	slre;
+	struct cap	caps[20];
+	/* Static storage: a 1 MB automatic array may not fit on the stack */
+	static char	data[1 * 1024 * 1024];
+	FILE		*fp;
+	int		i, count, res, len;
+
+	if (argc < 3) {
+		printf("Usage: %s 'slre' <file> [count]\n", argv[0]);
+	} else if ((fp = fopen(argv[2], "rb")) == NULL) {
+		printf("Error: cannot open %s:%s\n", argv[2], strerror(errno));
+	} else if (!slre_compile(&slre, argv[1])) {
+		printf("Error compiling slre: %s\n", slre.err_str);
+		(void) fclose(fp);
+	} else {
+		slre_dump(&slre, stderr);
+
+		(void) memset(caps, 0, sizeof(caps));
+
+		/* Read at most the first 1 MB of the file */
+		len = (int) fread(data, 1, sizeof(data), fp);
+		(void) fclose(fp);
+
+		res = 0;
+		count = argc > 3 ? atoi(argv[3]) : 1;
+		for (i = 0; i < count; i++)
+			res = slre_match(&slre, data, len, caps);
+
+		printf("Result: %d\n", res);
+
+		for (i = 0; i < 20; i++)
+			if (caps[i].len > 0)
+				printf("Substring %d: [%.*s]\n", i,
+				    caps[i].len, caps[i].ptr);
+	}
+
+	return (0);
+}
+#endif /* TEST */

A vendor/slre/slre.h

@@ -0,0 +1,92 @@ 
+/*
+ * Copyright (c) 2004-2005 Sergey Lyubka <valenok@gmail.com>
+ * All rights reserved
+ *
+ * "THE BEER-WARE LICENSE" (Revision 42):
+ * Sergey Lyubka wrote this file.  As long as you retain this notice you
+ * can do whatever you want with this stuff. If we meet some day, and you think
+ * this stuff is worth it, you can buy me a beer in return.
+ */
+
+/*
+ * This is a regular expression library that implements a subset of Perl RE.
+ * Please refer to http://slre.sourceforge.net for detailed description.
+ *
+ * Usage example (parsing HTTP request):
+ *
+ * struct slre	slre;
+ * struct cap	captures[4 + 1];  // Number of bracket pairs + 1
+ * ...
+ *
+ * slre_compile(&slre,"^(GET|POST) (\S+) HTTP/(\S+?)\r\n");
+ *
+ * if (slre_match(&slre, buf, len, captures)) {
+ *	printf("Request line length: %d\n", captures[0].len);
+ *	printf("Method: %.*s\n", captures[1].len, captures[1].ptr);
+ *	printf("URI: %.*s\n", captures[2].len, captures[2].ptr);
+ * }
+ *
+ * Supported syntax:
+ *	^		Match beginning of a buffer
+ *	$		Match end of a buffer
+ *	()		Grouping and substring capturing
+ *	[...]		Match any character from set
+ *	[^...]		Match any character but ones from set
+ *	\s		Match whitespace
+ *	\S		Match non-whitespace
+ *	\d		Match decimal digit
+ *	\r		Match carriage return
+ *	\n		Match newline
+ *	+		Match one or more times (greedy)
+ *	+?		Match one or more times (non-greedy)
+ *	*		Match zero or more times (greedy)
+ *	*?		Match zero or more times (non-greedy)
+ *	?		Match zero or once
+ *	\xDD		Match byte with hex value 0xDD
+ *	\meta		Match one of the meta characters: |^$().[*+?\
+ */
+
+#ifndef SLRE_HEADER_DEFINED
+#define	SLRE_HEADER_DEFINED
+
+/*
+ * Compiled regular expression
+ */
+struct slre {
+	unsigned char	code[256];
+	unsigned char	data[256];
+	int		code_size;
+	int		data_size;
+	int		num_caps;	/* Number of bracket pairs	*/
+	int		anchored;	/* Must match from string start	*/
+	const char	*err_str;	/* Error string			*/
+};
+
+/*
+ * Captured substring
+ */
+struct cap {
+	const char	*ptr;		/* Pointer to the substring	*/
+	int		len;		/* Substring length		*/
+};
+
+/*
+ * Compile regular expression. If success, 1 is returned.
+ * If error, 0 is returned and slre.err_str points to the error message. 
+ */
+int slre_compile(struct slre *, const char *re);
+
+/*
+ * Return 1 if match, 0 if no match. 
+ * If `captured_substrings' array is not NULL, then it is filled with the
+ * values of captured substrings. captured_substrings[0] element is always
+ * a full matched substring. The round bracket captures start from
+ * captured_substrings[1].
+ * It is assumed that the size of captured_substrings array is enough to
+ * hold all captures. The caller function must make sure it is! So, the
+ * array_size = number_of_round_bracket_pairs + 1
+ */
+int slre_match(const struct slre *, const char *buf, int buf_len,
+	struct cap *captured_substrings);
+
+#endif /* SLRE_HEADER_DEFINED */

D vendor/trex.nim

@@ -1,91 +0,0 @@ 
-{.compile: "vendor/T-Rex/libtrex.c".}
-{.push importc.}
-when not(defined(TREX_H)): 
-  const 
-    TREX_H* = true
-  #**************************************************************
-  # T-Rex a tiny regular expression library
-  #
-  # Copyright (C) 2003-2006 Alberto Demichelis
-  #
-  # This software is provided 'as-is', without any express 
-  # or implied warranty. In no event will the authors be held 
-  # liable for any damages arising from the use of this software.
-  #
-  # Permission is granted to anyone to use this software for 
-  # any purpose, including commercial applications, and to alter
-  # it and redistribute it freely, subject to the following restrictions:
-  #
-  #  1. The origin of this software must not be misrepresented;
-  #  you must not claim that you wrote the original software.
-  #  If you use this software in a product, an acknowledgment
-  #  in the product documentation would be appreciated but
-  #  is not required.
-  #
-  #  2. Altered source versions must be plainly marked as such,
-  #  and must not be misrepresented as being the original software.
-  #
-  #  3. This notice may not be removed or altered from any
-  #  source distribution.
-  #
-  #**************************************************************
-  ##ifdef _UNICODE
-  ##define TRexChar unsigned short
-  ##define MAX_CHAR 0xFFFF
-  ##define _TREXC(c) L##c 
-  ##define trex_strlen wcslen
-  ##define trex_printf wprintf
-  ##else
-  type
-    TRex* = object
-  const 
-    MAX_CHAR* = 0x000000FF
-
-  ##endif
-  const 
-    TRex_True* = 1
-    TRex_False* = 0
-  type 
-    TRexBool* = cuint
-    TRexMatch* = object 
-      begin*: cstring
-      len*: cint
-
-  proc compile*(pattern: cstring; error: ptr cstring): ptr TRex
-  proc free*(exp: ptr TRex)
-  proc match*(exp: ptr TRex; text: cstring): TRexBool
-  proc search*(exp: ptr TRex; text: cstring; 
-                    out_begin: ptr cstring; out_end: ptr cstring): TRexBool
-  proc searchrange*(exp: ptr TRex; text_begin: cstring; 
-                         text_end: cstring; out_begin: ptr cstring; 
-                         out_end: ptr cstring): TRexBool
-  proc getsubexpcount*(exp: ptr TRex): cint
-  proc getsubexp*(exp: ptr TRex; n: cint; subexp: ptr TRexMatch): TRexBool
-
-  # High level API
-  proc match(exp: string, str: string): bool =
-    var error = "INVALID_REGEX"
-    var regex = compile(expre, error)
-    # TODO raise error if regex invalid
-    let res = regex.match(str)
-    regex.free
-    if res == 1: return true else: return false
-
-  proc submatch(exp: string, n: int): string =
-    var error = "INVALID_REGEX"
-    var regex = compile(expre, error)
-    # TODO raise error if regex invalid
-    var sub = TRexMatch()
-    let res = regex.getsubexp(n, sub)
-    regex.free
-    # TODO raise error if no submatch
-    return sub
-
-  proc submatches(exp: string): int =
-    var error = "INVALID_REGEX"
-    var regex = compile(expre, error)
-    # TODO raise error if regex invalid
-    let res = regex.getsubexpcount()
-    regex.free
-    # TODO raise error if no submatch
-    return res

all repos — min @ 1de1f3d51a6537dc7d391d86a12a67c16bd4fec7

A small but practical concatenative programming language.