all repos — min @ 1de1f3d51a6537dc7d391d86a12a67c16bd4fec7

A small but practical concatenative programming language.

Implemented support for regular expressions via slre.
h3rald h3rald@h3rald.com
Sun, 14 Dec 2014 14:46:12 +0100
commit

1de1f3d51a6537dc7d391d86a12a67c16bd4fec7

parent

a410d018e5c1c59ba5d1f1a4057cee0f0948f522

11 files changed, 923 insertions(+), 997 deletions(-)

jump to
M lib/prelude.minlib/prelude.min

@@ -20,6 +20,7 @@ [unquote] [i] :

[unquote] [apply] : [filter] [select] : [clear] [empty] : +[match] [~] : // Common Environment Variables [os "windows" ==]
A lib/regex.nim

@@ -0,0 +1,36 @@

import tables
import ../core/parser, ../core/interpreter, ../core/utils
import ../vendor/slre

# Regular-expression symbols for the interpreter, implemented on top of the
# high-level procs exported by vendor/slre.

minsym "match":
  # Pops a pattern (top of stack) and then a subject string; pushes a single
  # value holding every string returned by slre's `match` for that pair.
  let pattern = i.pop
  let subject = i.pop
  if not (subject.isString and pattern.isString):
    i.error(errIncorrect, "Two strings are required on the stack")
  else:
    var captured = newSeq[TMinValue](0)
    for text in subject.strVal.match(pattern.strVal):
      captured.add text.newVal
    i.push captured.newVal

minsym "match?":
  # Pops a pattern and then a subject string; pushes true when slre's `match`
  # returns at least one string for them, false otherwise.
  let pattern = i.pop
  let subject = i.pop
  if not (subject.isString and pattern.isString):
    i.error(errIncorrect, "Two strings are required on the stack")
  else:
    let found = subject.strVal.match(pattern.strVal)
    i.push((found.len > 0).newVal)

minsym "gsub":
  # Pops, in this order, the replacement, the pattern and the subject string;
  # pushes the string produced by slre's `gsub` for them.
  let replacement = i.pop
  let pattern = i.pop
  let subject = i.pop
  if not (pattern.isString and replacement.isString and subject.isString):
    i.error(errIncorrect, "Three strings are required on the stack")
  else:
    let replaced = subject.strVal.gsub(pattern.strVal, replacement.strVal)
    i.push replaced.newVal
M minim.nimminim.nim

@@ -10,7 +10,8 @@ lib/numbers,

lib/logic, lib/time, lib/io, - lib/sys + lib/sys, + lib/regex const version* = "0.1.0" var debugging = false
D vendor/T-Rex/history.txt

@@ -1,15 +0,0 @@

-===version 1.3 --fixed a bug for GCC users(thx Brendan) - -===version 1.2 --added word boundary match \b and \B --added vertical tab escape \v --\w now also matches '_' (underscore) --fixed greediness for * and + - -===version 1.1 , April 1, 2004 --fixed some minor bug --added predefined character classes(\w,\W,\s,\S etc...) - -===version 1.0 , February 23, 2004 --first public release
D vendor/T-Rex/libtrex.c

@@ -1,652 +0,0 @@

-/* see copyright notice in trex.h */ -#include <string.h> -#include <stdlib.h> -#include <ctype.h> -#include <setjmp.h> -#include "trex.h" - -#ifdef _UINCODE -#define scisprint iswprint -#define scstrlen wcslen -#define scprintf wprintf -#define _SC(x) L(x) -#else -#define scisprint isprint -#define scstrlen strlen -#define scprintf printf -#define _SC(x) (x) -#endif - -#ifdef _DEBUG -#include <stdio.h> - -static const TRexChar *g_nnames[] = -{ - _SC("NONE"),_SC("OP_GREEDY"), _SC("OP_OR"), - _SC("OP_EXPR"),_SC("OP_NOCAPEXPR"),_SC("OP_DOT"), _SC("OP_CLASS"), - _SC("OP_CCLASS"),_SC("OP_NCLASS"),_SC("OP_RANGE"),_SC("OP_CHAR"), - _SC("OP_EOL"),_SC("OP_BOL"),_SC("OP_WB") -}; - -#endif -#define OP_GREEDY (MAX_CHAR+1) // * + ? {n} -#define OP_OR (MAX_CHAR+2) -#define OP_EXPR (MAX_CHAR+3) //parentesis () -#define OP_NOCAPEXPR (MAX_CHAR+4) //parentesis (?:) -#define OP_DOT (MAX_CHAR+5) -#define OP_CLASS (MAX_CHAR+6) -#define OP_CCLASS (MAX_CHAR+7) -#define OP_NCLASS (MAX_CHAR+8) //negates class the [^ -#define OP_RANGE (MAX_CHAR+9) -#define OP_CHAR (MAX_CHAR+10) -#define OP_EOL (MAX_CHAR+11) -#define OP_BOL (MAX_CHAR+12) -#define OP_WB (MAX_CHAR+13) - -#define TREX_SYMBOL_ANY_CHAR ('.') -#define TREX_SYMBOL_GREEDY_ONE_OR_MORE ('+') -#define TREX_SYMBOL_GREEDY_ZERO_OR_MORE ('*') -#define TREX_SYMBOL_GREEDY_ZERO_OR_ONE ('?') -#define TREX_SYMBOL_BRANCH ('|') -#define TREX_SYMBOL_END_OF_STRING ('$') -#define TREX_SYMBOL_BEGINNING_OF_STRING ('^') -#define TREX_SYMBOL_ESCAPE_CHAR ('\\') - - -typedef int TRexNodeType; - -typedef struct tagTRexNode{ - TRexNodeType type; - int left; - int right; - int next; -}TRexNode; - -struct TRex{ - const TRexChar *_eol; - const TRexChar *_bol; - const TRexChar *_p; - int _first; - int _op; - TRexNode *_nodes; - int _nallocated; - int _nsize; - int _nsubexpr; - TRexMatch *_matches; - int _currsubexp; - void *_jmpbuf; - const TRexChar **_error; -}; - -static int trex_list(TRex *exp); - -static int trex_newnode(TRex *exp, TRexNodeType type) -{ - 
TRexNode n; - int newid; - n.type = type; - n.next = n.right = n.left = -1; - if(type == OP_EXPR) - n.right = exp->_nsubexpr++; - if(exp->_nallocated < (exp->_nsize + 1)) { - int oldsize = exp->_nallocated; - exp->_nallocated *= 2; - exp->_nodes = (TRexNode *)realloc(exp->_nodes, exp->_nallocated * sizeof(TRexNode)); - } - exp->_nodes[exp->_nsize++] = n; - newid = exp->_nsize - 1; - return (int)newid; -} - -static void trex_error(TRex *exp,const TRexChar *error) -{ - if(exp->_error) *exp->_error = error; - longjmp(*((jmp_buf*)exp->_jmpbuf),-1); -} - -static void trex_expect(TRex *exp, int n){ - if((*exp->_p) != n) - trex_error(exp, _SC("expected paren")); - exp->_p++; -} - -static TRexChar trex_escapechar(TRex *exp) -{ - if(*exp->_p == TREX_SYMBOL_ESCAPE_CHAR){ - exp->_p++; - switch(*exp->_p) { - case 'v': exp->_p++; return '\v'; - case 'n': exp->_p++; return '\n'; - case 't': exp->_p++; return '\t'; - case 'r': exp->_p++; return '\r'; - case 'f': exp->_p++; return '\f'; - default: return (*exp->_p++); - } - } else if(!scisprint(*exp->_p)) trex_error(exp,_SC("letter expected")); - return (*exp->_p++); -} - -static int trex_charclass(TRex *exp,int classid) -{ - int n = trex_newnode(exp,OP_CCLASS); - exp->_nodes[n].left = classid; - return n; -} - -static int trex_charnode(TRex *exp,TRexBool isclass) -{ - TRexChar t; - if(*exp->_p == TREX_SYMBOL_ESCAPE_CHAR) { - exp->_p++; - switch(*exp->_p) { - case 'n': exp->_p++; return trex_newnode(exp,'\n'); - case 't': exp->_p++; return trex_newnode(exp,'\t'); - case 'r': exp->_p++; return trex_newnode(exp,'\r'); - case 'f': exp->_p++; return trex_newnode(exp,'\f'); - case 'v': exp->_p++; return trex_newnode(exp,'\v'); - case 'a': case 'A': case 'w': case 'W': case 's': case 'S': - case 'd': case 'D': case 'x': case 'X': case 'c': case 'C': - case 'p': case 'P': case 'l': case 'u': - { - t = *exp->_p; exp->_p++; - return trex_charclass(exp,t); - } - case 'b': - case 'B': - if(!isclass) { - int node = trex_newnode(exp,OP_WB); - 
exp->_nodes[node].left = *exp->_p; - exp->_p++; - return node; - } //else default - default: - t = *exp->_p; exp->_p++; - return trex_newnode(exp,t); - } - } - else if(!scisprint(*exp->_p)) { - - trex_error(exp,_SC("letter expected")); - } - t = *exp->_p; exp->_p++; - return trex_newnode(exp,t); -} -static int trex_class(TRex *exp) -{ - int ret = -1; - int first = -1,chain; - if(*exp->_p == TREX_SYMBOL_BEGINNING_OF_STRING){ - ret = trex_newnode(exp,OP_NCLASS); - exp->_p++; - }else ret = trex_newnode(exp,OP_CLASS); - - if(*exp->_p == ']') trex_error(exp,_SC("empty class")); - chain = ret; - while(*exp->_p != ']' && exp->_p != exp->_eol) { - if(*exp->_p == '-' && first != -1){ - int r,t; - if(*exp->_p++ == ']') trex_error(exp,_SC("unfinished range")); - r = trex_newnode(exp,OP_RANGE); - if(first>*exp->_p) trex_error(exp,_SC("invalid range")); - if(exp->_nodes[first].type == OP_CCLASS) trex_error(exp,_SC("cannot use character classes in ranges")); - exp->_nodes[r].left = exp->_nodes[first].type; - t = trex_escapechar(exp); - exp->_nodes[r].right = t; - exp->_nodes[chain].next = r; - chain = r; - first = -1; - } - else{ - if(first!=-1){ - int c = first; - exp->_nodes[chain].next = c; - chain = c; - first = trex_charnode(exp,TRex_True); - } - else{ - first = trex_charnode(exp,TRex_True); - } - } - } - if(first!=-1){ - int c = first; - exp->_nodes[chain].next = c; - chain = c; - first = -1; - } - /* hack? 
*/ - exp->_nodes[ret].left = exp->_nodes[ret].next; - exp->_nodes[ret].next = -1; - return ret; -} - -static int trex_parsenumber(TRex *exp) -{ - int ret = *exp->_p-'0'; - int positions = 10; - exp->_p++; - while(isdigit(*exp->_p)) { - ret = ret*10+(*exp->_p++-'0'); - if(positions==1000000000) trex_error(exp,_SC("overflow in numeric constant")); - positions *= 10; - }; - return ret; -} - -static int trex_element(TRex *exp) -{ - int ret = -1; - switch(*exp->_p) - { - case '(': { - int expr,newn; - exp->_p++; - - - if(*exp->_p =='?') { - exp->_p++; - trex_expect(exp,':'); - expr = trex_newnode(exp,OP_NOCAPEXPR); - } - else - expr = trex_newnode(exp,OP_EXPR); - newn = trex_list(exp); - exp->_nodes[expr].left = newn; - ret = expr; - trex_expect(exp,')'); - } - break; - case '[': - exp->_p++; - ret = trex_class(exp); - trex_expect(exp,']'); - break; - case TREX_SYMBOL_END_OF_STRING: exp->_p++; ret = trex_newnode(exp,OP_EOL);break; - case TREX_SYMBOL_ANY_CHAR: exp->_p++; ret = trex_newnode(exp,OP_DOT);break; - default: - ret = trex_charnode(exp,TRex_False); - break; - } - - { - int op; - TRexBool isgreedy = TRex_False; - unsigned short p0 = 0, p1 = 0; - switch(*exp->_p){ - case TREX_SYMBOL_GREEDY_ZERO_OR_MORE: p0 = 0; p1 = 0xFFFF; exp->_p++; isgreedy = TRex_True; break; - case TREX_SYMBOL_GREEDY_ONE_OR_MORE: p0 = 1; p1 = 0xFFFF; exp->_p++; isgreedy = TRex_True; break; - case TREX_SYMBOL_GREEDY_ZERO_OR_ONE: p0 = 0; p1 = 1; exp->_p++; isgreedy = TRex_True; break; - case '{': - exp->_p++; - if(!isdigit(*exp->_p)) trex_error(exp,_SC("number expected")); - p0 = (unsigned short)trex_parsenumber(exp); - /*******************************/ - switch(*exp->_p) { - case '}': - p1 = p0; exp->_p++; - break; - case ',': - exp->_p++; - p1 = 0xFFFF; - if(isdigit(*exp->_p)){ - p1 = (unsigned short)trex_parsenumber(exp); - } - trex_expect(exp,'}'); - break; - default: - trex_error(exp,_SC(", or } expected")); - } - /*******************************/ - isgreedy = TRex_True; - break; - - } - 
if(isgreedy) { - int nnode = trex_newnode(exp,OP_GREEDY); - op = OP_GREEDY; - exp->_nodes[nnode].left = ret; - exp->_nodes[nnode].right = ((p0)<<16)|p1; - ret = nnode; - } - } - if((*exp->_p != TREX_SYMBOL_BRANCH) && (*exp->_p != ')') && (*exp->_p != TREX_SYMBOL_GREEDY_ZERO_OR_MORE) && (*exp->_p != TREX_SYMBOL_GREEDY_ONE_OR_MORE) && (*exp->_p != '\0')) { - int nnode = trex_element(exp); - exp->_nodes[ret].next = nnode; - } - - return ret; -} - -static int trex_list(TRex *exp) -{ - int ret=-1,e; - if(*exp->_p == TREX_SYMBOL_BEGINNING_OF_STRING) { - exp->_p++; - ret = trex_newnode(exp,OP_BOL); - } - e = trex_element(exp); - if(ret != -1) { - exp->_nodes[ret].next = e; - } - else ret = e; - - if(*exp->_p == TREX_SYMBOL_BRANCH) { - int temp,tright; - exp->_p++; - temp = trex_newnode(exp,OP_OR); - exp->_nodes[temp].left = ret; - tright = trex_list(exp); - exp->_nodes[temp].right = tright; - ret = temp; - } - return ret; -} - -static TRexBool trex_matchcclass(int cclass,TRexChar c) -{ - switch(cclass) { - case 'a': return isalpha(c)?TRex_True:TRex_False; - case 'A': return !isalpha(c)?TRex_True:TRex_False; - case 'w': return (isalnum(c) || c == '_')?TRex_True:TRex_False; - case 'W': return (!isalnum(c) && c != '_')?TRex_True:TRex_False; - case 's': return isspace(c)?TRex_True:TRex_False; - case 'S': return !isspace(c)?TRex_True:TRex_False; - case 'd': return isdigit(c)?TRex_True:TRex_False; - case 'D': return !isdigit(c)?TRex_True:TRex_False; - case 'x': return isxdigit(c)?TRex_True:TRex_False; - case 'X': return !isxdigit(c)?TRex_True:TRex_False; - case 'c': return iscntrl(c)?TRex_True:TRex_False; - case 'C': return !iscntrl(c)?TRex_True:TRex_False; - case 'p': return ispunct(c)?TRex_True:TRex_False; - case 'P': return !ispunct(c)?TRex_True:TRex_False; - case 'l': return islower(c)?TRex_True:TRex_False; - case 'u': return isupper(c)?TRex_True:TRex_False; - } - return TRex_False; /*cannot happen*/ -} - -static TRexBool trex_matchclass(TRex* exp,TRexNode *node,TRexChar c) 
-{ - do { - switch(node->type) { - case OP_RANGE: - if(c >= node->left && c <= node->right) return TRex_True; - break; - case OP_CCLASS: - if(trex_matchcclass(node->left,c)) return TRex_True; - break; - default: - if(c == node->type)return TRex_True; - } - } while((node->next != -1) && (node = &exp->_nodes[node->next])); - return TRex_False; -} - -static const TRexChar *trex_matchnode(TRex* exp,TRexNode *node,const TRexChar *str,TRexNode *next) -{ - - TRexNodeType type = node->type; - switch(type) { - case OP_GREEDY: { - //TRexNode *greedystop = (node->next != -1) ? &exp->_nodes[node->next] : NULL; - TRexNode *greedystop = NULL; - int p0 = (node->right >> 16)&0x0000FFFF, p1 = node->right&0x0000FFFF, nmaches = 0; - const TRexChar *s=str, *good = str; - - if(node->next != -1) { - greedystop = &exp->_nodes[node->next]; - } - else { - greedystop = next; - } - - while((nmaches == 0xFFFF || nmaches < p1)) { - - const TRexChar *stop; - if(!(s = trex_matchnode(exp,&exp->_nodes[node->left],s,greedystop))) - break; - nmaches++; - good=s; - if(greedystop) { - //checks that 0 matches satisfy the expression(if so skips) - //if not would always stop(for instance if is a '?') - if(greedystop->type != OP_GREEDY || - (greedystop->type == OP_GREEDY && ((greedystop->right >> 16)&0x0000FFFF) != 0)) - { - TRexNode *gnext = NULL; - if(greedystop->next != -1) { - gnext = &exp->_nodes[greedystop->next]; - }else if(next && next->next != -1){ - gnext = &exp->_nodes[next->next]; - } - stop = trex_matchnode(exp,greedystop,s,gnext); - if(stop) { - //if satisfied stop it - if(p0 == p1 && p0 == nmaches) break; - else if(nmaches >= p0 && p1 == 0xFFFF) break; - else if(nmaches >= p0 && nmaches <= p1) break; - } - } - } - - if(s >= exp->_eol) - break; - } - if(p0 == p1 && p0 == nmaches) return good; - else if(nmaches >= p0 && p1 == 0xFFFF) return good; - else if(nmaches >= p0 && nmaches <= p1) return good; - return NULL; - } - case OP_OR: { - const TRexChar *asd = str; - TRexNode 
*temp=&exp->_nodes[node->left]; - while( (asd = trex_matchnode(exp,temp,asd,NULL)) ) { - if(temp->next != -1) - temp = &exp->_nodes[temp->next]; - else - return asd; - } - asd = str; - temp = &exp->_nodes[node->right]; - while( (asd = trex_matchnode(exp,temp,asd,NULL)) ) { - if(temp->next != -1) - temp = &exp->_nodes[temp->next]; - else - return asd; - } - return NULL; - break; - } - case OP_EXPR: - case OP_NOCAPEXPR:{ - TRexNode *n = &exp->_nodes[node->left]; - const TRexChar *cur = str; - int capture = -1; - if(node->type != OP_NOCAPEXPR && node->right == exp->_currsubexp) { - capture = exp->_currsubexp; - exp->_matches[capture].begin = cur; - exp->_currsubexp++; - } - - do { - TRexNode *subnext = NULL; - if(n->next != -1) { - subnext = &exp->_nodes[n->next]; - }else { - subnext = next; - } - if(!(cur = trex_matchnode(exp,n,cur,subnext))) { - if(capture != -1){ - exp->_matches[capture].begin = 0; - exp->_matches[capture].len = 0; - } - return NULL; - } - } while((n->next != -1) && (n = &exp->_nodes[n->next])); - - if(capture != -1) - exp->_matches[capture].len = cur - exp->_matches[capture].begin; - return cur; - } - case OP_WB: - if(str == exp->_bol && !isspace(*str) - || (str == exp->_eol && !isspace(*(str-1))) - || (!isspace(*str) && isspace(*(str+1))) - || (isspace(*str) && !isspace(*(str+1))) ) { - return (node->left == 'b')?str:NULL; - } - return (node->left == 'b')?NULL:str; - case OP_BOL: - if(str == exp->_bol) return str; - return NULL; - case OP_EOL: - if(str == exp->_eol) return str; - return NULL; - case OP_DOT:{ - *str++; - } - return str; - case OP_NCLASS: - case OP_CLASS: - if(trex_matchclass(exp,&exp->_nodes[node->left],*str)?(type == OP_CLASS?TRex_True:TRex_False):(type == OP_NCLASS?TRex_True:TRex_False)) { - *str++; - return str; - } - return NULL; - case OP_CCLASS: - if(trex_matchcclass(node->left,*str)) { - *str++; - return str; - } - return NULL; - default: /* char */ - if(*str != node->type) return NULL; - *str++; - return str; - } - return 
NULL; -} - -/* public api */ -TRex *trex_compile(const TRexChar *pattern,const TRexChar **error) -{ - TRex *exp = (TRex *)malloc(sizeof(TRex)); - exp->_eol = exp->_bol = NULL; - exp->_p = pattern; - exp->_nallocated = (int)scstrlen(pattern) * sizeof(TRexChar); - exp->_nodes = (TRexNode *)malloc(exp->_nallocated * sizeof(TRexNode)); - exp->_nsize = 0; - exp->_matches = 0; - exp->_nsubexpr = 0; - exp->_first = trex_newnode(exp,OP_EXPR); - exp->_error = error; - exp->_jmpbuf = malloc(sizeof(jmp_buf)); - if(setjmp(*((jmp_buf*)exp->_jmpbuf)) == 0) { - int res = trex_list(exp); - exp->_nodes[exp->_first].left = res; - if(*exp->_p!='\0') - trex_error(exp,_SC("unexpected character")); -#ifdef _DEBUG - { - int nsize,i; - TRexNode *t; - nsize = exp->_nsize; - t = &exp->_nodes[0]; - scprintf(_SC("\n")); - for(i = 0;i < nsize; i++) { - if(exp->_nodes[i].type>MAX_CHAR) - scprintf(_SC("[%02d] %10s "),i,g_nnames[exp->_nodes[i].type-MAX_CHAR]); - else - scprintf(_SC("[%02d] %10c "),i,exp->_nodes[i].type); - scprintf(_SC("left %02d right %02d next %02d\n"),exp->_nodes[i].left,exp->_nodes[i].right,exp->_nodes[i].next); - } - scprintf(_SC("\n")); - } -#endif - exp->_matches = (TRexMatch *) malloc(exp->_nsubexpr * sizeof(TRexMatch)); - memset(exp->_matches,0,exp->_nsubexpr * sizeof(TRexMatch)); - } - else{ - trex_free(exp); - return NULL; - } - return exp; -} - -void trex_free(TRex *exp) -{ - if(exp) { - if(exp->_nodes) free(exp->_nodes); - if(exp->_jmpbuf) free(exp->_jmpbuf); - if(exp->_matches) free(exp->_matches); - free(exp); - } -} - -TRexBool trex_match(TRex* exp,const TRexChar* text) -{ - const TRexChar* res = NULL; - exp->_bol = text; - exp->_eol = text + scstrlen(text); - exp->_currsubexp = 0; - res = trex_matchnode(exp,exp->_nodes,text,NULL); - - #ifdef _DEBUG - scprintf("DEBUG trex_match: res = '%s'\n", res); - scprintf("DEBUG trex_match: exp->_eol = '%s'\n", exp->_eol); - #endif - - // Fail match if trex_matchnode returns nothing - if (!res) { - return TRex_False; - } - - 
return TRex_True; -} - -TRexBool trex_searchrange(TRex* exp,const TRexChar* text_begin,const TRexChar* text_end,const TRexChar** out_begin, const TRexChar** out_end) -{ - const TRexChar *cur = NULL; - int node = exp->_first; - if(text_begin >= text_end) return TRex_False; - exp->_bol = text_begin; - exp->_eol = text_end; - do { - cur = text_begin; - while(node != -1) { - exp->_currsubexp = 0; - cur = trex_matchnode(exp,&exp->_nodes[node],cur,NULL); - if(!cur) - break; - node = exp->_nodes[node].next; - } - *text_begin++; - } while(cur == NULL && text_begin != text_end); - - if(cur == NULL) - return TRex_False; - - --text_begin; - - if(out_begin) *out_begin = text_begin; - if(out_end) *out_end = cur; - return TRex_True; -} - -TRexBool trex_search(TRex* exp,const TRexChar* text, const TRexChar** out_begin, const TRexChar** out_end) -{ - return trex_searchrange(exp,text,text + scstrlen(text),out_begin,out_end); -} - -int trex_getsubexpcount(TRex* exp) -{ - return exp->_nsubexpr; -} - -TRexBool trex_getsubexp(TRex* exp, int n, TRexMatch *subexp) -{ - if( n<0 || n >= exp->_nsubexpr) return TRex_False; - *subexp = exp->_matches[n]; - return TRex_True; -} -
D vendor/T-Rex/readme.txt

@@ -1,171 +0,0 @@

-T-REX 1.3 http://tiny-rex.sourceforge.net ----------------------------------------------------------------------- - T-Rex a tiny regular expression library - - Copyright (C) 2003-2006 Alberto Demichelis - - This software is provided 'as-is', without any express - or implied warranty. In no event will the authors be held - liable for any damages arising from the use of this software. - - Permission is granted to anyone to use this software for - any purpose, including commercial applications, and to alter - it and redistribute it freely, subject to the following restrictions: - - 1. The origin of this software must not be misrepresented; - you must not claim that you wrote the original software. - If you use this software in a product, an acknowledgment - in the product documentation would be appreciated but - is not required. - - 2. Altered source versions must be plainly marked as such, - and must not be misrepresented as being the original software. - - 3. This notice may not be removed or altered from any - source distribution. - ----------------------------------------------------------------------- -TRex implements the following expressions - -\ Quote the next metacharacter -^ Match the beginning of the string -. Match any character -$ Match the end of the string -| Alternation -() Grouping (creates a capture) -[] Character class - -==GREEDY CLOSURES== -* Match 0 or more times -+ Match 1 or more times -? 
Match 1 or 0 times -{n} Match exactly n times -{n,} Match at least n times -{n,m} Match at least n but not more than m times - -==ESCAPE CHARACTERS== -\t tab (HT, TAB) -\n newline (LF, NL) -\r return (CR) -\f form feed (FF) - -==PREDEFINED CLASSES== -\l lowercase letters -\u uppercase letters -\a letters -\A non letters -\w alphanumeric and underscore [0-9a-zA-Z_] -\W non alphanumeric -\s space -\S non space -\d digits -\D non digits -\x hexadecimal digits -\X non hexadecimal digits -\c control characters -\C non control characters -\p punctuation -\P non punctuation -\b word boundary -\B non word boundary - ----------------------------------------------------------------------- -API DOC ----------------------------------------------------------------------- -TRex *trex_compile(const TRexChar *pattern,const TRexChar **error); - -compiles an expression and returns a pointer to the compiled version. -in case of failure returns NULL. The returned object has to be deleted -through the function trex_free(). - -pattern - a pointer to a zero terminated string containing the pattern that - has to be compiled. -error - a pointer to a string pointer that will be set with an error string - in case of failure. - ----------------------------------------------------------------------- -void trex_free(TRex *exp) - -deletes an expression structure created with trex_compile() - -exp - the expression structure that has to be deleted - ----------------------------------------------------------------------- -TRexBool trex_match(TRex* exp,const TRexChar* text) - -returns TRex_True if the string specified in the parameter text is an -exact match of the expression, otherwise returns TRex_False. 
- -exp - the compiled expression -text - the string that has to be tested - ----------------------------------------------------------------------- -TRexBool trex_search(TRex* exp,const TRexChar* text, const TRexChar** out_begin, const TRexChar** out_end) - -searches the first match of the expression in the string specified in the parameter text. -if the match is found returns TRex_True and sets out_begin to the beginning of the -match and out_end at the end of the match; otherwise returns TRex_False. - -exp - the compiled expression -text - the string that has to be tested -out_begin - a pointer to a string pointer that will be set with the beginning of the match -out_end - a pointer to a string pointer that will be set with the end of the match - ----------------------------------------------------------------------- -TREX_API TRexBool trex_searchrange(TRex* exp,const TRexChar* text_begin,const TRexChar* text_end,const TRexChar** out_begin, const TRexChar** out_end) - -searches the first match of the expression in the string delimited -by the parameter text_begin and text_end. -if the match is found returns TRex_True and sets out_begin to the beginning of the -match and out_end at the end of the match; otherwise returns TRex_False. 
- -exp - the compiled expression -text_begin - a pointer to the beginning of the string that has to be tested -text_end - a pointer to the end of the string that has to be tested -out_begin - a pointer to a string pointer that will be set with the beginning of the match -out_end - a pointer to a string pointer that will be set with the end of the match - ----------------------------------------------------------------------- -int trex_getsubexpcount(TRex* exp) - -returns the number of sub expressions matched by the expression - -exp - the compiled expression - ---------------------------------------------------------------------- -TRexBool trex_getsubexp(TRex* exp, int n, TRexMatch *submatch) - -retrieves the begin pointer and the length of the sub expression indexed -by n. The result is passed through the struct TRexMatch: - -typedef struct { - const TRexChar *begin; - int len; -} TRexMatch; - -the function returns TRex_True if n is valid index otherwise TRex_False. - -exp - the compiled expression -n - the index of the submatch -submatch - a pointer to structure that will store the result - -this function works also after a match operation has been performed. -
D vendor/T-Rex/trex.h

@@ -1,67 +0,0 @@

-#ifndef _TREX_H_ -#define _TREX_H_ -/*************************************************************** - T-Rex a tiny regular expression library - - Copyright (C) 2003-2006 Alberto Demichelis - - This software is provided 'as-is', without any express - or implied warranty. In no event will the authors be held - liable for any damages arising from the use of this software. - - Permission is granted to anyone to use this software for - any purpose, including commercial applications, and to alter - it and redistribute it freely, subject to the following restrictions: - - 1. The origin of this software must not be misrepresented; - you must not claim that you wrote the original software. - If you use this software in a product, an acknowledgment - in the product documentation would be appreciated but - is not required. - - 2. Altered source versions must be plainly marked as such, - and must not be misrepresented as being the original software. - - 3. This notice may not be removed or altered from any - source distribution. 
- -****************************************************************/ - -//#ifdef _UNICODE -//#define TRexChar unsigned short -//#define MAX_CHAR 0xFFFF -//#define _TREXC(c) L##c -//#define trex_strlen wcslen -//#define trex_printf wprintf -//#else -#define TRexChar char -#define MAX_CHAR 0xFF -#define _TREXC(c) (c) -#define trex_strlen strlen -#define trex_printf printf -//#endif - -#ifndef extern -#define extern extern -#endif - -#define TRex_True 1 -#define TRex_False 0 - -typedef unsigned int TRexBool; -typedef struct TRex TRex; - -typedef struct { - const TRexChar *begin; - int len; -} TRexMatch; - -extern TRex *trex_compile(const TRexChar *pattern,const TRexChar **error); -extern void trex_free(TRex *exp); -extern TRexBool trex_match(TRex* exp,const TRexChar* text); -extern TRexBool trex_search(TRex* exp,const TRexChar* text, const TRexChar** out_begin, const TRexChar** out_end); -extern TRexBool trex_searchrange(TRex* exp,const TRexChar* text_begin,const TRexChar* text_end,const TRexChar** out_begin, const TRexChar** out_end); -extern int trex_getsubexpcount(TRex* exp); -extern TRexBool trex_getsubexp(TRex* exp, int n, TRexMatch *subexp); - -#endif
A vendor/slre.nim

@@ -0,0 +1,125 @@

#
# Copyright (c) 2004-2005 Sergey Lyubka <valenok@gmail.com>
# All rights reserved
#
# "THE BEER-WARE LICENSE" (Revision 42):
# Sergey Lyubka wrote this file. As long as you retain this notice you
# can do whatever you want with this stuff. If we meet some day, and you think
# this stuff is worth it, you can buy me a beer in return.
#
#
# This is a regular expression library that implements a subset of Perl RE.
# Please refer to http://slre.sourceforge.net for detailed description.
#
# Usage example (parsing HTTP request):
#
#   struct slre slre;
#   struct cap captures[4 + 1];  // Number of bracket pairs + 1
#   ...
#
#   slre_compile(&slre,"^(GET|POST) (\S+) HTTP/(\S+?)\r\n");
#
#   if (slre_match(&slre, buf, len, captures)) {
#     printf("Request line length: %d\n", captures[0].len);
#     printf("Method: %.*s\n", captures[1].len, captures[1].ptr);
#     printf("URI: %.*s\n", captures[2].len, captures[2].ptr);
#   }
#
# Supported syntax:
#   ^       Match beginning of a buffer
#   $       Match end of a buffer
#   ()      Grouping and substring capturing
#   [...]   Match any character from set
#   [^...]  Match any character but ones from set
#   \s      Match whitespace
#   \S      Match non-whitespace
#   \d      Match decimal digit
#   \r      Match carriage return
#   \n      Match newline
#   +       Match one or more times (greedy)
#   +?      Match one or more times (non-greedy)
#   *       Match zero or more times (greedy)
#   *?      Match zero or more times (non-greedy)
#   ?       Match zero or once
#   \xDD    Match byte with hex value 0xDD
#   \meta   Match one of the meta character: ^$().[*+?\
#

{.compile: "vendor/slre/libslre.c".}
#
# Compiled regular expression
#
type
  slre* = object
    code*: array[256, cuchar]
    data*: array[256, cuchar]
    code_size*: cint
    data_size*: cint
    num_caps*: cint    # Number of bracket pairs
    anchored*: cint    # Must match from string start
    err_str*: cstring  # Error string

#
# Captured substring
#
type
  cap* = object
    value*: cstring  # Pointer to the substring (inside the matched buffer;
                     # only `len` bytes belong to the capture)
    len*: cint       # Substring length

#
# Compile regular expression. If success, 1 is returned.
# If error, 0 is returned and slre.err_str points to the error message.
#
proc slre_compile(a2: ptr slre; re: cstring): cint {.importc, cdecl.}
#
# Return 1 if match, 0 if no match.
# If `captured_substrings' array is not NULL, then it is filled with the
# values of captured substrings. captured_substrings[0] element is always
# a full matched substring. The round bracket captures start from
# captured_substrings[1].
# It is assumed that the size of captured_substrings array is enough to
# hold all captures. The caller function must make sure it is! So, the
# array_size = number_of_round_bracket_pairs + 1
#
# The captures are passed as a plain pointer to the first element: an
# `openarray` parameter would make Nim pass an extra hidden length argument
# that the C function does not declare.
#
proc slre_match(a2: ptr slre; buf: cstring; buf_len: cint;
                captured_substrings: ptr cap): cint {.importc, cdecl.}

# High level API
from strutils import contains, replace, parseInt
from sequtils import delete

const minCaptureSlots = 10
  ## Lower bound for the capture buffer handed to slre_match (the size of the
  ## fixed array this module used before the buffer was sized per pattern).

proc match*(s: string, re: string): seq[string] =
  ## Matches `s` against the regular expression `re`.
  ##
  ## Returns the captured substrings in slot order: the full match first,
  ## then the bracket captures. Slots that slre left unset (nil pointer) are
  ## skipped, so the result can be shorter than the number of groups + 1.
  ## Returns an empty seq when `s` does not match.
  ##
  ## Raises `EInvalidValue` carrying slre's error string when `re` cannot be
  ## compiled.
  # The compiled program lives on the stack: nothing to free on any path
  # (the previous alloc0'd block was never released).
  var compiled: slre
  if slre_compile(addr compiled, re) != 1:
    raise newException(EInvalidValue, $(compiled.err_str))

  # slre_match writes num_caps + 1 entries without any bounds check, so the
  # buffer is sized from the compiled pattern instead of being fixed at 10.
  # newSeq zero-initialises, which the nil test below relies on.
  var captures = newSeq[cap](max(compiled.num_caps.int + 1, minCaptureSlots))

  result = newSeq[string](0)
  if slre_match(addr compiled, s.cstring, s.len.cint, addr captures[0]) == 1:
    for c in captures:
      if c.value != nil:
        # c.value points into `s` and is not terminated at the capture end:
        # convert, then cut down to the reported length.
        result.add(($(c.value)).substr(0, c.len.int - 1))

proc expandPlaceholders(tmpl: string, captures: openarray[string]): string =
  ## Returns `tmpl` with every `$N` (N a single digit) replaced by
  ## `captures[N]`. Placeholders whose index is out of range are kept as-is.
  ## The template is scanned once, so text inserted from a capture is never
  ## re-expanded.
  result = newStringOfCap(tmpl.len)
  var pos = 0
  while pos < tmpl.len:
    let ch = tmpl[pos]
    if ch == '$' and pos + 1 < tmpl.len and tmpl[pos + 1] in {'0'..'9'}:
      let idx = ord(tmpl[pos + 1]) - ord('0')
      if idx < captures.len:
        result.add captures[idx]
        inc pos, 2
        continue
    result.add ch
    inc pos

proc gsub*(s_find: string, re: string, s_replace: string): string =
  ## Replaces the text matched by `re` in `s_find` with `s_replace`.
  ##
  ## When `re` contains capture groups, `$N` placeholders in `s_replace` are
  ## substituted with the corresponding entry of `match(s_find, re)`.
  ## Returns `s_find` unchanged when there is no match or the match is empty.
  ##
  ## NOTE: only the first regex match is located; every literal occurrence of
  ## that matched text in `s_find` is then replaced.
  let matches = s_find.match(re)
  # An empty match has nothing to substitute, and searching for "" can make
  # strutils.replace spin forever on older standard libraries.
  if matches.len == 0 or matches[0].len == 0:
    return s_find
  # Placeholders are expanded in the replacement text only, before the
  # substitution, so `$N` sequences already present in `s_find` are left
  # alone and a capture containing "$N" cannot trigger an endless loop.
  let replacement =
    if matches.len > 1: expandPlaceholders(s_replace, matches)
    else: s_replace
  result = s_find.replace(matches[0], replacement)
A vendor/slre/libslre.c

@@ -0,0 +1,667 @@

+/* + * Copyright (c) 2004-2005 Sergey Lyubka <valenok@gmail.com> + * All rights reserved + * + * "THE BEER-WARE LICENSE" (Revision 42): + * Sergey Lyubka wrote this file. As long as you retain this notice you + * can do whatever you want with this stuff. If we meet some day, and you think + * this stuff is worth it, you can buy me a beer in return. + */ + +#include <stdio.h> +#include <assert.h> +#include <ctype.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> + +#include "slre.h" + +enum {END, BRANCH, ANY, EXACT, ANYOF, ANYBUT, OPEN, CLOSE, BOL, EOL, + STAR, PLUS, STARQ, PLUSQ, QUEST, SPACE, NONSPACE, DIGIT}; + +static struct { + const char *name; + int narg; + const char *flags; +} opcodes[] = { + {"END", 0, ""}, /* End of code block or program */ + {"BRANCH", 2, "oo"}, /* Alternative operator, "|" */ + {"ANY", 0, ""}, /* Match any character, "." */ + {"EXACT", 2, "d"}, /* Match exact string */ + {"ANYOF", 2, "D"}, /* Match any from set, "[]" */ + {"ANYBUT", 2, "D"}, /* Match any but from set, "[^]"*/ + {"OPEN ", 1, "i"}, /* Capture start, "(" */ + {"CLOSE", 1, "i"}, /* Capture end, ")" */ + {"BOL", 0, ""}, /* Beginning of string, "^" */ + {"EOL", 0, ""}, /* End of string, "$" */ + {"STAR", 1, "o"}, /* Match zero or more times "*" */ + {"PLUS", 1, "o"}, /* Match one or more times, "+" */ + {"STARQ", 1, "o"}, /* Non-greedy STAR, "*?" */ + {"PLUSQ", 1, "o"}, /* Non-greedy PLUS, "+?" */ + {"QUEST", 1, "o"}, /* Match zero or one time, "?" */ + {"SPACE", 0, ""}, /* Match whitespace, "\s" */ + {"NONSPACE", 0, ""}, /* Match non-space, "\S" */ + {"DIGIT", 0, ""} /* Match digit, "\d" */ +}; + +/* + * Commands and operands are all unsigned char (1 byte long). All code offsets + * are relative to current address, and positive (always point forward). Data + * offsets are absolute. Commands with operands: + * + * BRANCH offset1 offset2 + * Try to match the code block that follows the BRANCH instruction + * (code block ends with END). 
If no match, try to match code block that + * starts at offset1. If either of these match, jump to offset2. + * + * EXACT data_offset data_length + * Try to match exact string. String is recorded in data section from + * data_offset, and has length data_length. + * + * OPEN capture_number + * CLOSE capture_number + * If the user have passed 'struct cap' array for captures, OPEN + * records the beginning of the matched substring (cap->ptr), CLOSE + * sets the length (cap->len) for respective capture_number. + * + * STAR code_offset + * PLUS code_offset + * QUEST code_offset + * *, +, ?, respectively. Try to gobble as much as possible from the + * matched buffer, until code block that follows these instructions + * matches. When the longest possible string is matched, + * jump to code_offset + * + * STARQ, PLUSQ are non-greedy versions of STAR and PLUS. + */ + +static const char *meta_chars = "|.^$*+?()[\\"; + +static void +print_character_set(FILE *fp, const unsigned char *p, int len) +{ + int i; + + for (i = 0; i < len; i++) { + if (i > 0) + (void) fputc(',', fp); + if (p[i] == 0) { + i++; + if (p[i] == 0) + (void) fprintf(fp, "\\x%02x", p[i]); + else + (void) fprintf(fp, "%s", opcodes[p[i]].name); + } else if (isprint(p[i])) { + (void) fputc(p[i], fp); + } else { + (void) fprintf(fp,"\\x%02x", p[i]); + } + } +} + +void +slre_dump(const struct slre *r, FILE *fp) +{ + int i, j, ch, op, pc; + + for (pc = 0; pc < r->code_size; pc++) { + + op = r->code[pc]; + (void) fprintf(fp, "%3d %s ", pc, opcodes[op].name); + + for (i = 0; opcodes[op].flags[i] != '\0'; i++) + switch (opcodes[op].flags[i]) { + case 'i': + (void) fprintf(fp, "%d ", r->code[pc + 1]); + pc++; + break; + case 'o': + (void) fprintf(fp, "%d ", + pc + r->code[pc + 1] - i); + pc++; + break; + case 'D': + print_character_set(fp, r->data + + r->code[pc + 1], r->code[pc + 2]); + pc += 2; + break; + case 'd': + (void) fputc('"', fp); + for (j = 0; j < r->code[pc + 2]; j++) { + ch = r->data[r->code[pc + 1] + j]; 
+ if (isprint(ch)) + (void) fputc(ch, fp); + else + (void) fprintf(fp,"\\x%02x",ch); + } + (void) fputc('"', fp); + pc += 2; + break; + } + + (void) fputc('\n', fp); + } +} + +static void +set_jump_offset(struct slre *r, int pc, int offset) +{ + assert(offset < r->code_size); + + if (r->code_size - offset > 0xff) { + r->err_str = "Jump offset is too big"; + } else { + r->code[pc] = (unsigned char) (r->code_size - offset); + } +} + +static void +emit(struct slre *r, int code) +{ + if (r->code_size >= (int) (sizeof(r->code) / sizeof(r->code[0]))) + r->err_str = "RE is too long (code overflow)"; + else + r->code[r->code_size++] = (unsigned char) code; +} + +static void +store_char_in_data(struct slre *r, int ch) +{ + if (r->data_size >= (int) sizeof(r->data)) + r->err_str = "RE is too long (data overflow)"; + else + r->data[r->data_size++] = ch; +} + +static void +exact(struct slre *r, const char **re) +{ + int old_data_size = r->data_size; + + while (**re != '\0' && (strchr(meta_chars, **re)) == NULL) + store_char_in_data(r, *(*re)++); + + emit(r, EXACT); + emit(r, old_data_size); + emit(r, r->data_size - old_data_size); +} + +static int +get_escape_char(const char **re) +{ + int res; + + switch (*(*re)++) { + case 'n': res = '\n'; break; + case 'r': res = '\r'; break; + case 't': res = '\t'; break; + case '0': res = 0; break; + case 'S': res = NONSPACE << 8; break; + case 's': res = SPACE << 8; break; + case 'd': res = DIGIT << 8; break; + default: res = (*re)[-1]; break; + } + + return (res); +} + +static void +anyof(struct slre *r, const char **re) +{ + int esc, old_data_size = r->data_size, op = ANYOF; + + if (**re == '^') { + op = ANYBUT; + (*re)++; + } + + while (**re != '\0') + + switch (*(*re)++) { + case ']': + emit(r, op); + emit(r, old_data_size); + emit(r, r->data_size - old_data_size); + return; + /* NOTREACHED */ + break; + case '\\': + esc = get_escape_char(re); + if ((esc & 0xff) == 0) { + store_char_in_data(r, 0); + store_char_in_data(r, esc >> 8); + 
} else { + store_char_in_data(r, esc); + } + break; + default: + store_char_in_data(r, (*re)[-1]); + break; + } + + r->err_str = "No closing ']' bracket"; +} + +static void +relocate(struct slre *r, int begin, int shift) +{ + emit(r, END); + memmove(r->code + begin + shift, r->code + begin, r->code_size - begin); + r->code_size += shift; +} + +static void +quantifier(struct slre *r, int prev, int op) +{ + if (r->code[prev] == EXACT && r->code[prev + 2] > 1) { + r->code[prev + 2]--; + emit(r, EXACT); + emit(r, r->code[prev + 1] + r->code[prev + 2]); + emit(r, 1); + prev = r->code_size - 3; + } + relocate(r, prev, 2); + r->code[prev] = op; + set_jump_offset(r, prev + 1, prev); +} + +static void +exact_one_char(struct slre *r, int ch) +{ + emit(r, EXACT); + emit(r, r->data_size); + emit(r, 1); + store_char_in_data(r, ch); +} + +static void +fixup_branch(struct slre *r, int fixup) +{ + if (fixup > 0) { + emit(r, END); + set_jump_offset(r, fixup, fixup - 2); + } +} + +static void +compile(struct slre *r, const char **re) +{ + int op, esc, branch_start, last_op, fixup, cap_no, level; + + fixup = 0; + level = r->num_caps; + branch_start = last_op = r->code_size; + + for (;;) + switch (*(*re)++) { + case '\0': + (*re)--; + return; + /* NOTREACHED */ + break; + case '^': + emit(r, BOL); + break; + case '$': + emit(r, EOL); + break; + case '.': + last_op = r->code_size; + emit(r, ANY); + break; + case '[': + last_op = r->code_size; + anyof(r, re); + break; + case '\\': + last_op = r->code_size; + esc = get_escape_char(re); + if (esc & 0xff00) { + emit(r, esc >> 8); + } else { + exact_one_char(r, esc); + } + break; + case '(': + last_op = r->code_size; + cap_no = ++r->num_caps; + emit(r, OPEN); + emit(r, cap_no); + + compile(r, re); + if (*(*re)++ != ')') { + r->err_str = "No closing bracket"; + return; + } + + emit(r, CLOSE); + emit(r, cap_no); + break; + case ')': + (*re)--; + fixup_branch(r, fixup); + if (level == 0) { + r->err_str = "Unbalanced brackets"; + return; + } + 
return; + /* NOTREACHED */ + break; + case '+': + case '*': + op = (*re)[-1] == '*' ? STAR: PLUS; + if (**re == '?') { + (*re)++; + op = op == STAR ? STARQ : PLUSQ; + } + quantifier(r, last_op, op); + break; + case '?': + quantifier(r, last_op, QUEST); + break; + case '|': + fixup_branch(r, fixup); + relocate(r, branch_start, 3); + r->code[branch_start] = BRANCH; + set_jump_offset(r, branch_start + 1, branch_start); + fixup = branch_start + 2; + r->code[fixup] = 0xff; + break; + default: + (*re)--; + last_op = r->code_size; + exact(r, re); + break; + } +} + +int +slre_compile(struct slre *r, const char *re) +{ + r->err_str = NULL; + r->code_size = r->data_size = r->num_caps = r->anchored = 0; + + if (*re == '^') + r->anchored++; + + emit(r, OPEN); /* This will capture what matches full RE */ + emit(r, 0); + + while (*re != '\0') + compile(r, &re); + + if (r->code[2] == BRANCH) + fixup_branch(r, 4); + + emit(r, CLOSE); + emit(r, 0); + emit(r, END); + + return (r->err_str == NULL ? 1 : 0); +} + +static int match(const struct slre *, int, + const char *, int, int *, struct cap *); + +static void +loop_greedy(const struct slre *r, int pc, const char *s, int len, int *ofs) +{ + int saved_offset, matched_offset; + + saved_offset = matched_offset = *ofs; + + while (match(r, pc + 2, s, len, ofs, NULL)) { + saved_offset = *ofs; + if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL)) + matched_offset = saved_offset; + *ofs = saved_offset; + } + + *ofs = matched_offset; +} + +static void +loop_non_greedy(const struct slre *r, int pc, const char *s,int len, int *ofs) +{ + int saved_offset = *ofs; + + while (match(r, pc + 2, s, len, ofs, NULL)) { + saved_offset = *ofs; + if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL)) + break; + } + + *ofs = saved_offset; +} + +static int +is_any_of(const unsigned char *p, int len, const char *s, int *ofs) +{ + int i, ch; + + ch = s[*ofs]; + + for (i = 0; i < len; i++) + if (p[i] == ch) { + (*ofs)++; + return (1); + } + + return (0); +} 
+ +static int +is_any_but(const unsigned char *p, int len, const char *s, int *ofs) +{ + int i, ch; + + ch = s[*ofs]; + + for (i = 0; i < len; i++) + if (p[i] == ch) + return (0); + + (*ofs)++; + return (1); +} + +static int +match(const struct slre *r, int pc, const char *s, int len, + int *ofs, struct cap *caps) +{ + int n, saved_offset, res = 1; + + while (res && r->code[pc] != END) { + + assert(pc < r->code_size); + assert(pc < (int) (sizeof(r->code) / sizeof(r->code[0]))); + + switch (r->code[pc]) { + case BRANCH: + saved_offset = *ofs; + res = match(r, pc + 3, s, len, ofs, caps); + if (res == 0) { + *ofs = saved_offset; + res = match(r, pc + r->code[pc + 1], + s, len, ofs, caps); + } + pc += r->code[pc + 2]; + break; + case EXACT: + res = 0; + n = r->code[pc + 2]; /* String length */ + if (n <= len - *ofs && !memcmp(s + *ofs, r->data + + r->code[pc + 1], n)) { + (*ofs) += n; + res = 1; + } + pc += 3; + break; + case QUEST: + res = 1; + saved_offset = *ofs; + if (!match(r, pc + 2, s, len, ofs, caps)) + *ofs = saved_offset; + pc += r->code[pc + 1]; + break; + case STAR: + res = 1; + loop_greedy(r, pc, s, len, ofs); + pc += r->code[pc + 1]; + break; + case STARQ: + res = 1; + loop_non_greedy(r, pc, s, len, ofs); + pc += r->code[pc + 1]; + break; + case PLUS: + if ((res = match(r, pc + 2, s, len, ofs, caps)) == 0) + break; + + loop_greedy(r, pc, s, len, ofs); + pc += r->code[pc + 1]; + break; + case PLUSQ: + if ((res = match(r, pc + 2, s, len, ofs, caps)) == 0) + break; + + loop_non_greedy(r, pc, s, len, ofs); + pc += r->code[pc + 1]; + break; + case SPACE: + res = 0; + if (*ofs < len && isspace(((unsigned char *)s)[*ofs])) { + (*ofs)++; + res = 1; + } + pc++; + break; + case NONSPACE: + res = 0; + if (*ofs <len && !isspace(((unsigned char *)s)[*ofs])) { + (*ofs)++; + res = 1; + } + pc++; + break; + case DIGIT: + res = 0; + if (*ofs < len && isdigit(((unsigned char *)s)[*ofs])) { + (*ofs)++; + res = 1; + } + pc++; + break; + case ANY: + res = 0; + if (*ofs < len) 
{ + (*ofs)++; + res = 1; + } + pc++; + break; + case ANYOF: + res = 0; + if (*ofs < len) + res = is_any_of(r->data + r->code[pc + 1], + r->code[pc + 2], s, ofs); + pc += 3; + break; + case ANYBUT: + res = 0; + if (*ofs < len) + res = is_any_but(r->data + r->code[pc + 1], + r->code[pc + 2], s, ofs); + pc += 3; + break; + case BOL: + res = *ofs == 0 ? 1 : 0; + pc++; + break; + case EOL: + res = *ofs == len ? 1 : 0; + pc++; + break; + case OPEN: + if (caps != NULL) + caps[r->code[pc + 1]].ptr = s + *ofs; + pc += 2; + break; + case CLOSE: + if (caps != NULL) + caps[r->code[pc + 1]].len = (s + *ofs) - + caps[r->code[pc + 1]].ptr; + pc += 2; + break; + case END: + pc++; + break; + default: + printf("unknown cmd (%d) at %d\n", r->code[pc], pc); + assert(0); + break; + } + } + + return (res); +} + +int +slre_match(const struct slre *r, const char *buf, int len, + struct cap *caps) +{ + int i, ofs = 0, res = 0; + + if (r->anchored) { + res = match(r, 0, buf, len, &ofs, caps); + } else { + for (i = 0; i < len && res == 0; i++) { + ofs = i; + res = match(r, 0, buf, len, &ofs, caps); + } + } + + return (res); +} + +#ifdef TEST +int main(int argc, char *argv[]) +{ + struct slre slre; + struct cap caps[20]; + char data[1 * 1024 * 1024]; + FILE *fp; + int i, count, res, len; + + if (argc < 3) { + printf("Usage: %s 'slre' <file> [count]\n", argv[0]); + } else if ((fp = fopen(argv[2], "rb")) == NULL) { + printf("Error: cannot open %s:%s\n", argv[2], strerror(errno)); + } else if (!slre_compile(&slre, argv[1])) { + printf("Error compiling slre: %s\n", slre.err_str); + } else { + slre_dump(&slre, stderr); + + (void) memset(caps, 0, sizeof(caps)); + + /* Read first 128K of file */ + len = fread(data, 1, sizeof(data), fp); + (void) fclose(fp); + + res = 0; + count = argc > 3 ? 
atoi(argv[3]) : 1; + for (i = 0; i < count; i++) + res = slre_match(&slre, data, len, caps); + + printf("Result: %d\n", res); + + for (i = 0; i < 20; i++) + if (caps[i].len > 0) + printf("Substring %d: [%.*s]\n", i, + caps[i].len, caps[i].ptr); + } + + return (0); +} +#endif /* TEST */
A vendor/slre/slre.h

@@ -0,0 +1,92 @@

/*
 * Copyright (c) 2004-2005 Sergey Lyubka <valenok@gmail.com>
 * All rights reserved
 *
 * "THE BEER-WARE LICENSE" (Revision 42):
 * Sergey Lyubka wrote this file. As long as you retain this notice you
 * can do whatever you want with this stuff. If we meet some day, and you think
 * this stuff is worth it, you can buy me a beer in return.
 */

/*
 * This is a regular expression library that implements a subset of Perl RE.
 * Please refer to http://slre.sourceforge.net for detailed description.
 *
 * Usage example (parsing HTTP request):
 *
 * struct slre	slre;
 * struct cap	captures[3 + 1];  // Number of bracket pairs + 1
 * ...
 *
 * // Backslashes meant for the regex are doubled inside a C string literal
 * slre_compile(&slre,"^(GET|POST) (\\S+) HTTP/(\\S+?)\r\n");
 *
 * if (slre_match(&slre, buf, len, captures)) {
 *	printf("Request line length: %d\n", captures[0].len);
 *	printf("Method: %.*s\n", captures[1].len, captures[1].ptr);
 *	printf("URI: %.*s\n", captures[2].len, captures[2].ptr);
 * }
 *
 * Supported syntax:
 *	^		Match beginning of a buffer
 *	$		Match end of a buffer
 *	()		Grouping and substring capturing
 *	[...]		Match any character from set
 *	[^...]		Match any character but ones from set
 *	\s		Match whitespace
 *	\S		Match non-whitespace
 *	\d		Match decimal digit
 *	\r		Match carriage return
 *	\n		Match newline
 *	+		Match one or more times (greedy)
 *	+?		Match one or more times (non-greedy)
 *	*		Match zero or more times (greedy)
 *	*?		Match zero or more times (non-greedy)
 *	?		Match zero or once
 *	\xDD		Match byte with hex value 0xDD
 *			(NOTE: listed here, but the escape decoder in
 *			libslre.c has no case for 'x' and treats it as a
 *			literal character)
 *	\meta		Match one of the meta character: ^$().[*+?\
 */

#ifndef SLRE_HEADER_DEFINED
#define	SLRE_HEADER_DEFINED

/*
 * Compiled regular expression
 */
struct slre {
	unsigned char	code[256];
	unsigned char	data[256];
	int		code_size;
	int		data_size;
	int		num_caps;	/* Number of bracket pairs	*/
	int		anchored;	/* Must match from string start	*/
	const char	*err_str;	/* Error string			*/
};

/*
 * Captured substring
 */
struct cap {
	const char	*ptr;		/* Pointer to the substring	*/
	int		len;		/* Substring length		*/
};

/*
 * Compile regular expression. If success, 1 is returned.
 * If error, 0 is returned and slre.err_str points to the error message.
 */
int slre_compile(struct slre *, const char *re);

/*
 * Return 1 if match, 0 if no match.
 * If `captured_substrings' array is not NULL, then it is filled with the
 * values of captured substrings. captured_substrings[0] element is always
 * a full matched substring. The round bracket captures start from
 * captured_substrings[1].
 * It is assumed that the size of captured_substrings array is enough to
 * hold all captures. The caller function must make sure it is! So, the
 * array_size = number_of_round_bracket_pairs + 1
 * Each captured substring points into `buf' and is delimited by its `len',
 * not by a NUL terminator.
 */
int slre_match(const struct slre *, const char *buf, int buf_len,
	struct cap *captured_substrings);

#endif /* SLRE_HEADER_DEFINED */
D vendor/trex.nim

@@ -1,91 +0,0 @@

-{.compile: "vendor/T-Rex/libtrex.c".} -{.push importc.} -when not(defined(TREX_H)): - const - TREX_H* = true - #************************************************************** - # T-Rex a tiny regular expression library - # - # Copyright (C) 2003-2006 Alberto Demichelis - # - # This software is provided 'as-is', without any express - # or implied warranty. In no event will the authors be held - # liable for any damages arising from the use of this software. - # - # Permission is granted to anyone to use this software for - # any purpose, including commercial applications, and to alter - # it and redistribute it freely, subject to the following restrictions: - # - # 1. The origin of this software must not be misrepresented; - # you must not claim that you wrote the original software. - # If you use this software in a product, an acknowledgment - # in the product documentation would be appreciated but - # is not required. - # - # 2. Altered source versions must be plainly marked as such, - # and must not be misrepresented as being the original software. - # - # 3. This notice may not be removed or altered from any - # source distribution. 
- # - #************************************************************** - ##ifdef _UNICODE - ##define TRexChar unsigned short - ##define MAX_CHAR 0xFFFF - ##define _TREXC(c) L##c - ##define trex_strlen wcslen - ##define trex_printf wprintf - ##else - type - TRex* = object - const - MAX_CHAR* = 0x000000FF - - ##endif - const - TRex_True* = 1 - TRex_False* = 0 - type - TRexBool* = cuint - TRexMatch* = object - begin*: cstring - len*: cint - - proc compile*(pattern: cstring; error: ptr cstring): ptr TRex - proc free*(exp: ptr TRex) - proc match*(exp: ptr TRex; text: cstring): TRexBool - proc search*(exp: ptr TRex; text: cstring; - out_begin: ptr cstring; out_end: ptr cstring): TRexBool - proc searchrange*(exp: ptr TRex; text_begin: cstring; - text_end: cstring; out_begin: ptr cstring; - out_end: ptr cstring): TRexBool - proc getsubexpcount*(exp: ptr TRex): cint - proc getsubexp*(exp: ptr TRex; n: cint; subexp: ptr TRexMatch): TRexBool - - # High level API - proc match(exp: string, str: string): bool = - var error = "INVALID_REGEX" - var regex = compile(expre, error) - # TODO raise error if regex invalid - let res = regex.match(str) - regex.free - if res == 1: return true else: return false - - proc submatch(exp: string, n: int): string = - var error = "INVALID_REGEX" - var regex = compile(expre, error) - # TODO raise error if regex invalid - var sub = TRexMatch() - let res = regex.getsubexp(n, sub) - regex.free - # TODO raise error if no submatch - return sub - - proc submatches(exp: string): int = - var error = "INVALID_REGEX" - var regex = compile(expre, error) - # TODO raise error if regex invalid - let res = regex.getsubexpcount() - regex.free - # TODO raise error if no submatch - return res