all repos — min @ a0598c9364e04d2c6691d8620e379fe7e82a9562

A small but practical concatenative programming language.

Replaced SLRE with SGRegex.
h3rald h3rald@h3rald.com
Sat, 28 May 2016 22:20:05 +0200
commit

a0598c9364e04d2c6691d8620e379fe7e82a9562

parent

fbacd017237284143204d32aeb792d861a6c8f47

7 files changed, 1288 insertions(+), 884 deletions(-)

jump to
A core/regex.nim

@@ -0,0 +1,60 @@

## High-level regular expression helpers built on the vendored SGRegex
## C library (see `vendor/sgregex`).
##
## Every public proc compiles the pattern, runs one operation and always
## releases the native context again, even when an exception is raised.
import strutils
import ../vendor/sgregex


proc compileRegex(pattern, mods: string): ptr srx_Context =
  ## Compiles `pattern` with the modifier string `mods` (any of "m", "i", "s").
  ##
  ## Raises `ValueError` when SGRegex rejects the pattern or the modifiers.
  ## Without this check the nil context would be dereferenced inside the
  ## C library by the subsequent match/replace call.
  var errnpos: array[2, cint] # [0] = error code, [1] = offset of the error
  result = srx_CreateExt(pattern, pattern.len, mods, addr errnpos[0], nil, nil)
  if result.isNil:
    raise newException(ValueError,
      "invalid regular expression /" & pattern & "/" & mods &
      " (error " & $errnpos[0] & " at position " & $errnpos[1] & ")")

proc match*(str, pattern, mods: string): bool =
  ## Returns true when `pattern` (compiled with `mods`) matches anywhere
  ## in `str`. Raises `ValueError` for an invalid pattern or modifier.
  let r = compileRegex(pattern, mods)
  try:
    result = srx_Match(r, str, 0) == 1
  finally:
    discard srx_Destroy(r)

proc match*(str, pattern: string): bool =
  ## Same as `match` with no modifiers.
  return match(str, pattern, "")

proc search*(str, pattern, mods: string): seq[string] =
  ## Returns the captures of the first match of `pattern` in `str`:
  ## index 0 is the whole match, indexes 1.. are the parenthesised groups.
  ##
  ## Returns an empty sequence when nothing matches (capture offsets are
  ## meaningless in that case). A group whose offsets do not lie inside
  ## `str` is reported as "".
  ## Raises `ValueError` for an invalid pattern or modifier.
  let r = compileRegex(pattern, mods)
  try:
    if srx_Match(r, str, 0) != 1:
      return @[]
    let count = srx_GetCaptureCount(r)
    result = newSeq[string](count)
    for i in 0..count-1:
      var first = 0
      var last = 0
      let ok = srx_GetCaptured(r, i, addr first, addr last) == 1
      # Guard against groups that did not take part in the match: their
      # native pointers are unset and translate to out-of-range offsets.
      if ok and first >= 0 and first <= last and last <= str.len:
        result[i] = str.substr(first, last-1)
      else:
        result[i] = ""
  finally:
    discard srx_Destroy(r)

proc search*(str, pattern: string): seq[string] =
  ## Same as `search` with no modifiers.
  return search(str, pattern, "")

proc replace*(str, pattern, repl, mods: string): string =
  ## Replaces every match of `pattern` in `str` with `repl`; `$N` / `\N`
  ## in `repl` refer to capture N.
  ## Raises `ValueError` for an invalid pattern or modifier.
  let r = compileRegex(pattern, mods)
  try:
    let raw = srx_Replace(r, str, repl)
    if raw.isNil:
      result = ""
    else:
      # Copy into a Nim string first, then hand the native buffer back to
      # the library; it was previously leaked on every call.
      result = $raw
      srx_FreeReplaced(r, raw)
  finally:
    discard srx_Destroy(r)

proc replace*(str, pattern, repl: string): string =
  ## Same as `replace` with no modifiers.
  return replace(str, pattern, repl, "")

when isMainModule:

  proc tmatch(str, pattern: string) =
    echo str, " =~ ", "/", pattern, "/", " -> ", str.match(pattern)

  proc tsearch(str, pattern: string) =
    echo str, " =~ ", "/", pattern, "/", " -> ", str.search(pattern)

  proc tsearch(str, pattern, mods: string) =
    echo str, " =~ ", "/", pattern, "/", mods, " -> ", str.search(pattern, mods)

  proc treplace(str, pattern, repl: string) =
    echo str, " =~ ", "s/", pattern, "/", repl, "/", " -> ", str.replace(pattern, repl)


  "HELLO".tmatch("^H(.*)O$")
  "HELLO".tmatch("^H(.*)S$")
  "HELLO".tsearch("^H(E)(.*)O$")
  "Hello, World!".treplace("[a-zA-Z]+,", "Goodbye,")
  "127.0.0.1".tsearch("^([0-9]{1,3})\\.([0-9]{1,3})\\.([0-9]{1,3})\\.([0-9]{1,3})$")
  "127.0.0.1".treplace("^([0-9]{1,3})\\.([0-9]{1,3})\\.([0-9]{1,3})\\.([0-9]{1,3})$", "$4.$3.$1.$2")
  "127.0.0.1".treplace("[0-9]+", "255")
  "Hello".tsearch("HELLO", "i")
  "Hello\nWorld!".tsearch("HELLO.WORLD", "mis")
A vendor/sgregex.nim

@@ -0,0 +1,57 @@

{.compile: "sgregex/libregex.c".}
# Low-level Nim bindings for the SGRegex C library (sgregex/regex.h).
# The names mirror the C API one to one.

const
  # Error codes reported through `errnpos[0]` of `srx_CreateExt`.
  RXSUCCESS* = 0
  RXEINMOD* = -1  # invalid modifier
  RXEPART* = -2   # partial (sub-)expression
  RXEUNEXP* = -3  # unexpected character
  RXERANGE* = -4  # invalid range (min > max)
  RXELIMIT* = -5  # too many digits
  RXEEMPTY* = -6  # expression is effectively empty
  RXENOREF* = -7  # the specified backreference cannot be used here
  RX_ALLMODS* = "mis"


type
  # Allocator callback: (userdata, ptr, size) -> pointer.
  # It must be `cdecl`: without an explicit calling convention a Nim proc
  # type is a closure (code pointer + environment), which is not what the
  # C library expects for `srx_MemFunc`.
  srx_MemFunc* = proc (a2: pointer; a3: pointer; a4: csize): pointer {.cdecl.}

  # Opaque compiled-expression handle; only ever used through `ptr`.
  srx_Context* = object

proc c_realloc(p: pointer; size: csize): pointer {.
  importc: "realloc", header: "<stdlib.h>".}
proc c_free(p: pointer) {.importc: "free", header: "<stdlib.h>".}

proc RX_STRLENGTHFUNC*(str: string): int =
  ## Length function used by the convenience templates below.
  return str.len

proc srx_DefaultMemFunc*(userdata: pointer, ptr1: pointer, size: csize): pointer {.cdecl.} =
  ## Default allocator, equivalent to the one in regex.h: a non-zero `size`
  ## (re)allocates `ptr1`, a zero `size` frees it and returns nil.
  ## `userdata` is ignored.
  if size != 0:
    return c_realloc(ptr1, size)
  c_free(ptr1)
  return nil

# All imported procs use the C calling convention of the compiled library.
{.push importc, cdecl.}
proc srx_CreateExt*(str: cstring; strsize: csize; mods: cstring;
                    errnpos: ptr cint; memfn: srx_MemFunc; memctx: pointer): ptr srx_Context
  ## Compiles `str`; returns nil on error. `errnpos`, when not nil, must
  ## point to two `cint`s receiving the error code and its position.
  ## A nil `memfn` selects the library's built-in allocator.
proc srx_Destroy*(R: ptr srx_Context): cint
proc srx_DumpToStdout*(R: ptr srx_Context)
proc srx_MatchExt*(R: ptr srx_Context; str: cstring; size: csize;
                   offset: csize): cint
proc srx_GetCaptureCount*(R: ptr srx_Context): cint
proc srx_GetCaptured*(R: ptr srx_Context; which: cint; pbeg: ptr csize;
                      pend: ptr csize): cint
# NOTE(review): the C parameters are `const RX_Char**` (pointer to a single
# char pointer); `cstringArray` has the same ABI and is kept for
# source compatibility.
proc srx_GetCapturedPtrs*(R: ptr srx_Context; which: cint;
                          pbeg: cstringArray; pend: cstringArray): cint
proc srx_ReplaceExt*(R: ptr srx_Context; str: cstring; strsize: csize;
                     rep: cstring; repsize: csize; outsize: ptr csize): cstring
  ## The returned buffer must be released with `srx_FreeReplaced`.
proc srx_FreeReplaced*(R: ptr srx_Context; repstr: cstring)
{.pop.}

# Convenience wrappers mirroring the C macros of the same name.
# They are kept outside the importc section: they are plain Nim templates.
template srx_Create*(str, mods: untyped): untyped =
  srx_CreateExt(str, RX_STRLENGTHFUNC(str), mods, nil, nil, nil)

template srx_Match*(R, str, off: untyped): untyped =
  srx_MatchExt(R, str, RX_STRLENGTHFUNC(str), off)

template srx_Replace*(R, str, rep: untyped): untyped =
  srx_ReplaceExt(R, str, RX_STRLENGTHFUNC(str), rep, RX_STRLENGTHFUNC(rep), nil)
A vendor/sgregex/libregex.c

@@ -0,0 +1,1101 @@

+ +#include <ctype.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <limits.h> + +#define MAX(a,b) ((a)>(b)?(a):(b)) + + +#define RX_NEED_DEFAULT_MEMFUNC +#include "regex.h" + + +#define RX_MAX_CAPTURES 10 + + +#define RX_MALLOC( bytes ) R->memfn( R->memctx, NULL, bytes ) +#define RX_ALLOC_N( what, N ) (what*) R->memfn( R->memctx, NULL, sizeof( what ) * ((size_t)(N)) ) +#define RX_ALLOC( what ) RX_ALLOC_N( what, 1 ) +#define RX_FREE( ptr ) R->memfn( R->memctx, ptr, 0 ) + +#define RX_IS_ALPHA( x ) rx_isalpha( x ) +#define RX_EQUALIZE( x ) rx_tolower( x ) + + +#define RIT_MATCH 1 /* matching */ +#define RIT_RANGE 2 +#define RIT_SPCBEG 3 +#define RIT_SPCEND 4 +#define RIT_BKREF 5 +#define RIT_EITHER 11 /* control */ +#define RIT_SUBEXP 12 + +#define RIF_LAZY 0x01 +#define RIF_INVERT 0x02 + +#define RCF_MULTILINE 0x01 /* ^/$ matches beginning/end of line too */ +#define RCF_CASELESS 0x02 /* pre-equalized case for match/range */ +#define RCF_DOTALL 0x04 /* "." is compiled as "[^]" instead of "[^\r\n]" */ + +#ifndef RXLOG +#define RXLOG 0 +#endif + +#if RXLOG +#define RXLOGINFO( x ) x +#else +#define RXLOGINFO( x ) +#endif +#define RX_LOGLIM( str, strend, off ) (int)((strend)-(str)<(off)?(strend)-(str):(off)), str + + +static int rx_isalpha( RX_Char c ) +{ + return ( c >= 'a' && c <= 'z' ) + || ( c >= 'A' && c <= 'Z' ); +} + +static RX_Char rx_tolower( RX_Char c ) +{ + if( c >= 'A' && c <= 'Z' ) + return (RX_Char)( c - 'A' + 'a' ); + return c; +} + + +typedef struct _regex_item regex_item; +struct _regex_item +{ + /* structure */ + regex_item* prev; + regex_item* next; + regex_item* ch, *ch2; + regex_item* pos; + + RX_Char* range; + int count; + + int type, flags; + RX_Char a; + int min, max; + + /* match state */ + const RX_Char *matchbeg, *matchend; + int counter; +}; + +struct _srx_Context +{ + /* structure */ + regex_item* root; + int flags; + + /* memory */ + srx_MemFunc memfn; + void* memctx; + + /* captures */ + regex_item* caps[ 
RX_MAX_CAPTURES ]; + int numcaps; + + /* temporary data */ + const RX_Char* string; + const RX_Char* stringend; +}; + +typedef struct _match_ctx +{ + const RX_Char* string; + const RX_Char* stringend; + regex_item* item; + srx_Context* R; +} +match_ctx; + + +static int regex_test( const RX_Char* str, match_ctx* ctx ); + + +static int regex_match_once( match_ctx* ctx ) +{ + int i; + regex_item* item = ctx->item; + const RX_Char* str = item->matchend; + RXLOGINFO( printf( "type %d char %d('%c') action at %p (%.*s)\n", + item->type, (int) item->a, item->a, str, RX_LOGLIM(str,ctx->stringend,5) ) ); + switch( item->type ) + { + case RIT_MATCH: + if( str >= ctx->stringend ) + break; + { + RX_Char ch = *str; + if( ctx->R->flags & RCF_CASELESS ) + ch = RX_EQUALIZE( *str ); + if( ch == item->a ) + { + item->matchend++; + return 1; + } + } + break; + case RIT_RANGE: + if( str >= ctx->stringend ) + break; + { + RX_Char ch = *str; + int inv = ( item->flags & RIF_INVERT ) != 0, inrange = 0; + if( ctx->R->flags & RCF_CASELESS ) + ch = RX_EQUALIZE( *str ); + for( i = 0; i < item->count*2; i += 2 ) + { + if( ch >= item->range[i] && ch <= item->range[i+1] ) + { + inrange = 1; + break; + } + } + if( inrange ^ inv ) + { + item->matchend++; + return 1; + } + } + break; + case RIT_SPCBEG: + if( ctx->R->flags & RCF_MULTILINE && item->matchend < ctx->stringend && ( *item->matchend == '\n' || *item->matchend == '\r' ) ) + { + if( *item->matchend == '\r' && item->matchend[1] == '\n' ) + item->matchend++; + item->matchend++; + item->matchbeg = item->matchend; + return 1; + } + return ctx->string == item->matchend; + case RIT_SPCEND: + if( ctx->R->flags & RCF_MULTILINE && item->matchend < ctx->stringend && ( *item->matchend == '\n' || *item->matchend == '\r' ) ) + { + return 1; + } + return str >= ctx->stringend; + case RIT_BKREF: + { + regex_item* cap = ctx->R->caps[ (int) item->a ]; + ptrdiff_t len = cap->matchend - cap->matchbeg; + ptrdiff_t len2 = ctx->stringend - str; + if( len2 >= len 
&& memcmp( cap->matchbeg, str, (size_t) len ) == 0 ) + { + item->matchend += len; + return 1; + } + } + break; + case RIT_SUBEXP: + { + match_ctx cc; + { + cc.string = ctx->string; + cc.stringend = ctx->stringend; + cc.item = item->pos ? item->pos : item->ch; + cc.R = ctx->R; + } + if( regex_test( str, &cc ) ) + { + regex_item* p = item->ch; + while( p->next ) + p = p->next; + item->pos = NULL; + item->matchend = p->matchend; + return 1; + } + } + break; + } + return 0; +} + +static int regex_match_many( match_ctx* ctx ) +{ + /* returns whether matched */ + regex_item* item = ctx->item; + item->matchend = item->matchbeg; + if( item->type == RIT_EITHER ) + { + regex_item* chi = item->counter ? item->ch2 : item->ch; + match_ctx cc; + { + cc.string = ctx->string; + cc.stringend = ctx->stringend; + cc.item = chi; + cc.R = ctx->R; + } + if( regex_test( item->matchbeg, &cc ) ) + { + regex_item* p = chi; + while( p->next ) + p = p->next; + item->matchend = p->matchend; + return 1; + } + return 0; + } + else + { + int i; + for( i = 0; i < item->counter; ++i ) + { + if( item->matchend >= ctx->stringend && item->type != RIT_SPCEND && item->type != RIT_EITHER && item->type != RIT_SUBEXP ) + { + item->counter = item->flags & RIF_LAZY ? item->max : i; + RXLOGINFO( printf( "stopped while matching, counter = %d, %d between %d and %d?\n", item->counter, i, item->min, item->max ) ); + return i >= item->min && i <= item->max; + } + if( !regex_match_once( ctx ) ) + { + item->counter = item->flags & RIF_LAZY ? item->max : i; + RXLOGINFO( printf( "did not match, counter reset to %d\n", item->counter ) ); + return i >= item->min && i <= item->max; + } + RXLOGINFO( else printf( "matched\n" ) ); + } + return 1; + } +} + +static void regex_full_reset( regex_item* p ); +static void regex_reset_one( regex_item* p ) +{ + if( p->ch ) regex_full_reset( p->ch ); + if( p->ch2 ) regex_full_reset( p->ch2 ); + p->pos = p->ch; + p->matchbeg = p->matchend = NULL; + p->counter = p->flags & RIF_LAZY ? 
p->min : p->max; +} +static void regex_full_reset( regex_item* p ) +{ + while( p ) + { + regex_reset_one( p ); + p = p->next; + } +} + +static regex_item* regex_lastch( regex_item* item ) +{ + regex_item* p = item->ch; + while( p && p->next ) + p = p->next; + return p; +} + +static int regex_subexp_backtrack( regex_item* item ) +{ + int chgh = 0; + regex_item* p = item->pos ? item->pos : regex_lastch( item ); + + while( p ) + { + RXLOGINFO( printf( "backtracker at type %d char %d\n", p->type, (int) p->a ) ); + if( chgh && p->type == RIT_SUBEXP && regex_subexp_backtrack( p ) ) + break; + else if( p->flags & RIF_LAZY ) + { + p->counter++; + if( p->counter <= p->max ) + break; + } + else + { + p->counter--; + if( p->counter >= p->min ) + break; + } + RXLOGINFO( printf( "subexp backtrack - reset current, move back\n" ) ); + regex_reset_one( p ); + p = p->prev; + chgh = 1; + } + + RXLOGINFO( printf( "subexp backtrack - %s\n", p ? "success" : "failure" ) ); + RXLOGINFO( if( p ) printf( "subexp-backtracked to type %d ctr=%d min=%d max=%d\n", p->type, p->counter, p->min, p->max ) ); + + return !!p; +} + +static int regex_test( const RX_Char* str, match_ctx* ctx ) +{ + regex_item* p = ctx->item; + p->matchbeg = str; + + for(;;) + { + int res; + match_ctx cc; + { + cc.string = ctx->string; + cc.stringend = ctx->stringend; + cc.item = p; + cc.R = ctx->R; + } + RXLOGINFO( printf( "match_many: item %p type %d at position %p (%.*s)\n", + (void*) p, p->type, p->matchbeg, RX_LOGLIM(p->matchbeg,ctx->stringend,5) ) ); + res = regex_match_many( &cc ); + if( res ) + { + p = p->next; + if( !p ) + { + RXLOGINFO( printf( "test of subexp %p SUCCEEDED\n", (void*) ctx->item ) ); + return 1; + } + RXLOGINFO( printf( "moving on to type %d action\n", p->type ) ); + p->matchbeg = p->prev->matchend; + } + else + { + int chgh = 0; + while( p ) + { + if( chgh && p->type == RIT_SUBEXP && regex_subexp_backtrack( p ) ) + break; + else if( p->flags & RIF_LAZY ) + { + p->counter++; + if( p->counter <= 
p->max ) + break; + } + else + { + p->counter--; + if( p->counter >= p->min ) + break; + } + RXLOGINFO( printf( "backtrack, reset current\n" ) ); + regex_reset_one( p ); + p = p->prev; + chgh = 1; + } + if( !p ) + { + RXLOGINFO( printf( "test of subexp %p BT-ENDED\n", (void*) ctx->item ) ); + return 0; + } + } + } +} + +static int regex_test_start( const RX_Char* str, match_ctx* ctx ) +{ + regex_item* p = ctx->item; + RXLOGINFO( printf( "test start - counter reset\n" ) ); + regex_reset_one( p ); + return regex_test( str, ctx ); +} + + +/* + mapping: + - [^a-zA-Z] ... RIT_RANGE, optional RIF_INVERT + - "." ... empty RIT_RANGE + RIF_INVERT + - "\s" and others ... predefined RIT_RANGE with optional RIF_INVERT + - "|" ... RIT_EITHER + - "(..)" ... RIT_SUBEXP + - "?" ... range = [0,1] + - "*" ... range = [0,INT_MAX] + - "+" ... range = [1,INT_MAX] + - "{1,5}" ... range = [1,5] (other ranges mapped similarly) + - "^" ... RIT_SPCBEG + - "$" ... RIT_SPCEND + - "\1" ... RIT_BKREF +*/ + +static void regex_free_item( srx_Context* R, regex_item* item ); +static void regex_dealloc_item( srx_Context* R, regex_item* item ) +{ + if( item->range ) + RX_FREE( item->range ); + if( item->ch ) regex_free_item( R, item->ch ); + if( item->ch2 ) regex_free_item( R, item->ch2 ); + RX_FREE( item ); +} + +static void regex_free_item( srx_Context* R, regex_item* item ) +{ + regex_item *p, *c; + if( !item ) + return; + p = item->prev; + while( p ) + { + c = p; + p = p->prev; + regex_dealloc_item( R, c ); + } + p = item->next; + while( p ) + { + c = p; + p = p->next; + regex_dealloc_item( R, c ); + } + regex_dealloc_item( R, item ); +} + +static void regex_level( regex_item** pitem ) +{ + /* TODO: balanced/non-(pseudo-)binary leveling */ + regex_item* item = *pitem; + while( item ) + { + if( item->type == RIT_EITHER ) + { + regex_item* next = item->next; + regex_level( &next ); + + if( item->prev ) + { + item->prev->next = NULL; + item->prev = NULL; + } + if( item->next ) + { + item->next->prev 
= NULL; + item->next = NULL; + } + + item->ch = *pitem; + item->ch2 = next; + + *pitem = item; + return; + } + item = item->next; + } +} + +static int regex_real_compile( srx_Context* R, int* cel, const RX_Char** pstr, const RX_Char* pend, int sub, regex_item** out ) +{ +#define _RX_ALLOC_NODE( ty ) \ + item = RX_ALLOC( regex_item ); \ + memset( item, 0, sizeof(*item) ); \ + if( citem ) \ + { \ + citem->next = item; \ + item->prev = citem; \ + } \ + item->type = ty; \ + item->min = 1; \ + item->max = 1; + +#define _RXE( err ) for(;;){ error = err; goto fail; } + + const RX_Char* s = *pstr; + regex_item* item = NULL, *citem = NULL; + int error = 0; + while( s < pend ) + { + if( sub && *s == ')' ) + break; + switch( *s ) + { + case '[': + { + const RX_Char* sc; + int inv = 0, cnt = 0; + RX_Char* ri; + s++; + if( *s == '^' ) + { + inv = 1; + s++; + } + sc = s; + if( *sc == ']' ) + { + sc++; + cnt++; + } + while( *sc && *sc != ']' ) + { + if( *sc == '-' && sc > s && sc[1] != 0 && sc[1] != ']' ) + sc++; + else + cnt++; + sc++; + } + if( !*sc ) + _RXE( RXEPART ); + _RX_ALLOC_NODE( RIT_RANGE ); + if( inv ) + item->flags |= RIF_INVERT; + item->range = ri = RX_ALLOC_N( RX_Char, cnt * 2 ); + item->count = cnt; + sc = s; + if( *sc == ']' ) + { + sc++; + ri[0] = ri[1] = *sc; + ri += 2; + } + while( *sc && *sc != ']' ) + { + if( *sc == '-' && sc > s && sc[1] != 0 && sc[1] != ']' ) + { + if( ri > item->range ) + *(ri-1) = sc[1]; + sc++; + } + else + { + ri[0] = ri[1] = *sc; + ri += 2; + } + sc++; + } + s = sc; + if( *s == ']' ) + s++; + if( R->flags & RCF_CASELESS ) + { + int i; + ri = item->range; + for( i = 0; i < cnt * 2; i += 2 ) + { + RX_Char A = ri[ i ], B = ri[ i + 1 ]; + if( RX_IS_ALPHA( A ) && RX_IS_ALPHA( B ) ) + { + ri[ i ] = RX_EQUALIZE( A ); + ri[ i + 1 ] = RX_EQUALIZE( B ); + } + } + } + } + break; + case ']': + _RXE( RXEUNEXP ); + case '(': + { + int r, cap = R->numcaps < RX_MAX_CAPTURES ? 
1 : -1; + _RX_ALLOC_NODE( RIT_SUBEXP ); + if( cap >= 0 ) + { + cap = R->numcaps++; + R->caps[ cap ] = item; + } + s++; + r = regex_real_compile( R, cel, &s, pend, 1, &item->ch ); + if( r ) + _RXE( r ); + item->pos = item->ch; + if( *s != ')' ) + _RXE( RXEUNEXP ); + if( cap >= 0 ) + cel[ cap ] = 1; + s++; + } + break; + case ')': + _RXE( RXEUNEXP ); + case '{': + case '?': + case '*': + case '+': + if( s > *pstr && ( *(s-1) == '}' || *(s-1) == '?' || *(s-1) == '*' || *(s-1) == '+' ) ) + { + if( *s == '?' ) + item->flags |= RIF_LAZY; + else + _RXE( RXEUNEXP ); + } + else if( item && ( item->type == RIT_MATCH || item->type == RIT_RANGE || item->type == RIT_BKREF || item->type == RIT_SUBEXP ) ) + { + int min = 1, max = 1; + if( *s == '{' ) + { + int ctr; + s++; + if( !isdigit( *s ) ) + _RXE( RXEUNEXP ); + min = 0; + ctr = 8; + while( isdigit( *s ) && ctr > 0 ) + { + min = min * 10 + *s++ - '0'; + ctr--; + } + if( isdigit( *s ) && ctr == 0 ) + _RXE( RXELIMIT ); + if( *s == ',' ) + { + if( !isdigit(s[1]) ) + _RXE( RXEUNEXP ); + s++; + max = 0; + ctr = 8; + while( isdigit( *s ) && ctr > 0 ) + { + max = max * 10 + *s++ - '0'; + ctr--; + } + if( isdigit( *s ) && ctr == 0 ) + _RXE( RXELIMIT ); + if( min > max ) + _RXE( RXERANGE ); + } + else + max = min; + if( *s != '}' ) + _RXE( RXEUNEXP ); + } + else if( *s == '?' ){ min = 0; max = 1; } + else if( *s == '*' ){ min = 0; max = INT_MAX - 1; } + else if( *s == '+' ){ min = 1; max = INT_MAX - 1; } + item->min = min; + item->max = max; + } + else + _RXE( RXEUNEXP ); + s++; + break; + case '}': + _RXE( RXEUNEXP ); + case '|': + if( !citem ) + _RXE( RXEUNEXP ); + _RX_ALLOC_NODE( RIT_EITHER ); + item->min = 0; + item->max = 1; + item->flags |= RIF_LAZY; + s++; + break; + case '^': + _RX_ALLOC_NODE( RIT_SPCBEG ); + s++; + break; + case '$': + _RX_ALLOC_NODE( RIT_SPCEND ); + s++; + break; + case '\\': + if( s[1] ) + { + s++; + if( *s == '.' 
) + { + _RX_ALLOC_NODE( RIT_MATCH ); + item->a = *s++; + break; + } + else if( isdigit( *s ) ) + { + int dig = *s++ - '0'; + if( dig == 0 || dig >= RX_MAX_CAPTURES || !cel[ dig ] ) + _RXE( RXENOREF ); + _RX_ALLOC_NODE( RIT_BKREF ); + item->a = (RX_Char) dig; + break; + } + else if( *s == 'd' || *s == 'D' ) + { + _RX_ALLOC_NODE( RIT_RANGE ); + item->range = RX_ALLOC_N( RX_Char, 2 ); + item->count = 1; + item->range[0] = '0'; + item->range[1] = '9'; + if( *s == 'D' ) + item->flags |= RIF_INVERT; + s++; + break; + } + else if( *s == 'h' || *s == 'H' ) + { + _RX_ALLOC_NODE( RIT_RANGE ); + item->range = RX_ALLOC_N( RX_Char, 2 * 2 ); + item->count = 2; + item->range[0] = item->range[1] = '\t'; + item->range[2] = item->range[3] = ' '; + if( *s == 'H' ) + item->flags |= RIF_INVERT; + s++; + break; + } + else if( *s == 'v' || *s == 'V' ) + { + _RX_ALLOC_NODE( RIT_RANGE ); + item->range = RX_ALLOC_N( RX_Char, 2 ); + item->count = 1; + item->range[0] = 0x0A; + item->range[1] = 0x0D; + if( *s == 'V' ) + item->flags |= RIF_INVERT; + s++; + break; + } + else if( *s == 's' || *s == 'S' ) + { + _RX_ALLOC_NODE( RIT_RANGE ); + item->range = RX_ALLOC_N( RX_Char, 2 * 2 ); + item->count = 2; + item->range[0] = 0x09; + item->range[1] = 0x0D; + item->range[2] = item->range[3] = ' '; + if( *s == 'S' ) + item->flags |= RIF_INVERT; + s++; + break; + } + else if( *s == 'w' || *s == 'W' ) + { + _RX_ALLOC_NODE( RIT_RANGE ); + item->range = RX_ALLOC_N( RX_Char, 2 * 4 ); + item->count = 4; + item->range[0] = 'a'; item->range[1] = 'z'; + item->range[2] = 'A'; item->range[3] = 'Z'; + item->range[4] = '0'; item->range[5] = '9'; + item->range[6] = item->range[7] = '_'; + if( *s == 'W' ) + item->flags |= RIF_INVERT; + s++; + break; + } + /* TODO: more character classes */ + } + else + _RXE( RXEPART ); + default: + if( *s == '.' 
) + { + _RX_ALLOC_NODE( RIT_RANGE ); + if( !( R->flags & RCF_DOTALL ) ) + { + item->range = RX_ALLOC_N( RX_Char, 2 * 2 ); + item->range[0] = item->range[1] = '\n'; + item->range[2] = item->range[3] = '\r'; + item->count = 2; + } + item->flags |= RIF_INVERT; + } + else + { + _RX_ALLOC_NODE( RIT_MATCH ); + item->a = *s; + if( R->flags & RCF_CASELESS && RX_IS_ALPHA( item->a ) ) + item->a = RX_EQUALIZE( item->a ); + } + s++; + break; + } + citem = item; + } + if( !item ) + _RXE( RXEEMPTY ); + if( item->type == RIT_EITHER ) + _RXE( RXEPART ); + *pstr = s; + while( item->prev ) + item = item->prev; + regex_level( &item ); + *out = item; + return RXSUCCESS; +fail: + regex_free_item( R, item ); + return (int)( ( error & 0xf ) | ( ( s - R->string ) << 4 ) ); +} + +/* + #### srx_CreateExt #### +*/ +srx_Context* srx_CreateExt( const RX_Char* str, size_t strsize, const RX_Char* mods, int* errnpos, srx_MemFunc memfn, void* memctx ) +{ + int flags = 0, err, cel[ RX_MAX_CAPTURES ]; + srx_Context* R = NULL; + if( mods ) + { + const RX_Char* modbegin = mods; + while( *mods ) + { + switch( *mods ) + { + case 'm': flags |= RCF_MULTILINE; break; + case 'i': flags |= RCF_CASELESS; break; + case 's': flags |= RCF_DOTALL; break; + default: + err = ( RXEINMOD & 0xf ) | ( (int)( mods - modbegin ) << 4 ); + goto fail; + } + mods++; + } + } + + if( !memfn ) + memfn = srx_DefaultMemFunc; + + R = (srx_Context*) memfn( memctx, NULL, sizeof(srx_Context) ); + memset( R, 0, sizeof(*R) ); + memset( cel, 0, sizeof(cel) ); + R->memfn = memfn; + R->memctx = memctx; + R->string = str; + R->stringend = str + strsize; + R->flags = flags; + R->numcaps = 1; + + err = regex_real_compile( R, cel, &str, str + strsize, 0, &R->root ); + + if( err ) + { + memfn( memctx, R, 0 ); + R = NULL; + } + else + { + regex_item* item = RX_ALLOC( regex_item ); + memset( item, 0, sizeof(*item) ); + item->type = RIT_SUBEXP; + item->min = 1; + item->max = 1; + item->pos = item->ch = R->root; + R->caps[ 0 ] = R->root = item; + 
} +fail: + if( errnpos ) + { + unsigned uerr = (unsigned) err; + errnpos[0] = (int)( uerr ? ( uerr & 0xf ) | 0xfffffff0 : 0 ); + errnpos[1] = (int)( ( uerr & 0xfffffff0 ) >> 4 ); + } + RXLOGINFO( if( R ) srx_DumpToStdout(R) ); + return R; +} + +/* + #### srx_Destroy #### +*/ +int srx_Destroy( srx_Context* R ) +{ + if( R ) + { + srx_MemFunc memfn = R->memfn; + void* memctx = R->memctx; + if( R->root ) + regex_free_item( R, R->root ); + memfn( memctx, R, 0 ); + } + return !!R; +} + + +static void regex_dump_list( regex_item* items, int lev ); +static void regex_dump_item( regex_item* item, int lev ) +{ + const char* types[] = + { + "-", "MATCH (1)", "RANGE (2)", "SPCBEG (3)", "SPCEND (4)", "BKREF (5)", "-", "-", "-", "-", + "-", "EITHER (11)", "SUBEXP (12)", "-" + }; + + int l = lev; + while( l --> 0 ) + printf( "- " ); + printf( "%s", types[ item->type ] ); + if( item->flags & RIF_INVERT ) printf( " INV" ); + if( item->flags & RIF_LAZY ) printf( " LAZY" ); + switch( item->type ) + { + case RIT_MATCH: printf( " char %d", (int) item->a ); break; + case RIT_RANGE: + for( l = 0; l < item->count; ++l ) + { + if( l > 0 ) + printf( "," ); + printf( " %d - %d", (int) item->range[l*2], (int) item->range[l*2+1] ); + } + break; + case RIT_BKREF: printf( " #%d", (int) item->a ); break; + } + printf( " (%d to %d) (0x%p => 0x%p)\n", item->min, item->max, item->matchbeg, item->matchend ); + + if( item->ch ) + { + regex_dump_list( item->ch, lev + 1 ); + if( item->ch2 ) + { + int l2 = lev; + while( l2 --> 0 ) + printf( "- " ); + printf( "--|\n" ); + regex_dump_list( item->ch2, lev + 1 ); + } + } +} +static void regex_dump_list( regex_item* items, int lev ) +{ + while( items ) + { + regex_dump_item( items, lev ); + items = items->next; + } +} + +/* + #### srx_DumpToStdout #### +*/ +void srx_DumpToStdout( srx_Context* R ) +{ + regex_dump_list( R->root, 0 ); +} + +/* + #### srx_Match #### +*/ +int srx_MatchExt( srx_Context* R, const RX_Char* str, size_t size, size_t offset ) +{ + int 
ret; + const RX_Char* strend = str + size; + match_ctx ctx; + { + ctx.string = str; + ctx.stringend = strend; + ctx.item = R->root; + ctx.R = R; + } + R->string = str; + if( offset > size ) + return 0; + str += offset; + while( str < strend ) + { + ret = regex_test_start( str, &ctx ); + if( ret < 0 ) + return 0; + if( ret > 0 ) + return 1; + str++; + } + return 0; +} + +/* + #### srx_GetCaptureCount #### +*/ +int srx_GetCaptureCount( srx_Context* R ) +{ + return R->numcaps; +} + +/* + #### srx_GetCaptured #### +*/ +int srx_GetCaptured( srx_Context* R, int which, size_t* pbeg, size_t* pend ) +{ + const RX_Char* a, *b; + if( srx_GetCapturedPtrs( R, which, &a, &b ) ) + { + if( pbeg ) *pbeg = (size_t)( a - R->string ); + if( pend ) *pend = (size_t)( b - R->string ); + return 1; + } + return 0; +} + +/* + #### srx_GetCapturedPtrs #### +*/ +int srx_GetCapturedPtrs( srx_Context* R, int which, const RX_Char** pbeg, const RX_Char** pend ) +{ + if( which < 0 || which >= R->numcaps ) + return 0; + if( R->caps[ which ] == NULL ) + return 0; + if( pbeg ) *pbeg = R->caps[ which ]->matchbeg; + if( pend ) *pend = R->caps[ which ]->matchend; + return 1; +} + +/* + #### srx_ReplaceExt #### +*/ +RX_Char* srx_ReplaceExt( srx_Context* R, const RX_Char* str, size_t strsize, const RX_Char* rep, size_t repsize, size_t* outsize ) +{ + RX_Char* out = ""; + const RX_Char *from = str, *fromend = str + strsize, *repend = rep + repsize; + size_t size = 0, mem = 0; + +#define SR_CHKSZ( szext ) \ + if( (ptrdiff_t)( mem - size ) < (ptrdiff_t)(szext) ) \ + { \ + size_t nsz = MAX( mem * 2, size + (size_t)(szext) ); \ + RX_Char* nmem = RX_ALLOC_N( RX_Char, nsz + 1 ); \ + if( mem ) \ + { \ + memcpy( nmem, out, size + 1 ); /* copy with \0 */ \ + RX_FREE( out ); \ + } \ + out = nmem; \ + mem = nsz; \ + } +#define SR_ADDBUF( from, to ) \ + SR_CHKSZ( to - from ) \ + memcpy( out + size, from, (size_t)( to - from ) ); \ + size += (size_t)( to - from ); + + while( from < fromend ) + { + const RX_Char* ofp = 
NULL, *ep = NULL, *rp; + if( !srx_MatchExt( R, from, (size_t)( fromend - from ), 0 ) ) + break; + srx_GetCapturedPtrs( R, 0, &ofp, &ep ); + SR_ADDBUF( from, ofp ); + + rp = rep; + while( rp < repend ) + { + RX_Char rc = *rp; + if( ( rc == '\\' || rc == '$' ) && rp + 1 < repend ) + { + if( isdigit( rp[1] ) ) + { + int dig = rp[1] - '0'; + const RX_Char *brp, *erp; + if( srx_GetCapturedPtrs( R, dig, &brp, &erp ) ) + { + SR_ADDBUF( brp, erp ); + } + rp += 2; + continue; + } + else if( rp[1] == rc ) + { + rp++; + } + } + SR_ADDBUF( rp, rp + 1 ); + rp++; + } + + if( from == ep ) + from++; + else + from = ep; + } + + SR_ADDBUF( from, fromend ); + if( outsize ) + *outsize = size; + { + char nul[1] = {0}; + SR_ADDBUF( nul, &nul[1] ); + } + return out; +} + +/* + #### srx_FreeReplaced #### +*/ +void srx_FreeReplaced( srx_Context* R, RX_Char* repstr ) +{ + RX_FREE( repstr ); +} +
A vendor/sgregex/regex.h

@@ -0,0 +1,70 @@


/*
	SGRegex public API.

	Define RX_NEED_DEFAULT_MEMFUNC before including this header to get
	srx_DefaultMemFunc (required by the srx_Create convenience macro).
	Define RX_STRLENGTHFUNC to override the length function used by the
	convenience macros (defaults to strlen).
*/

#pragma once

#ifdef __cplusplus
extern "C" {
#endif

#include <stddef.h>
#include <string.h> /* strlen, used by the default RX_STRLENGTHFUNC */
#ifdef RX_NEED_DEFAULT_MEMFUNC
#include <stdlib.h> /* realloc / free, used by srx_DefaultMemFunc */
#endif


/* error codes (parenthesized so they are safe inside any expression) */
#define RXSUCCESS 0
#define RXEINMOD  (-1) /* invalid modifier */
#define RXEPART   (-2) /* partial (sub-)expression */
#define RXEUNEXP  (-3) /* unexpected character */
#define RXERANGE  (-4) /* invalid range (min > max) */
#define RXELIMIT  (-5) /* too many digits */
#define RXEEMPTY  (-6) /* expression is effectively empty */
#define RXENOREF  (-7) /* the specified backreference cannot be used here */

#define RX_ALLMODS "mis"

#ifndef RX_STRLENGTHFUNC
#define RX_STRLENGTHFUNC( str ) strlen( str )
#endif


/* allocator callback: size != 0 (re)allocates ptr, size == 0 frees it */
typedef void* (*srx_MemFunc)
(
	void* /* userdata */,
	void* /* ptr */,
	size_t /* size */
);

#ifdef RX_NEED_DEFAULT_MEMFUNC
static void* srx_DefaultMemFunc( void* userdata, void* ptr, size_t size )
{
	(void) userdata;
	if( size )
		return realloc( ptr, size );
	free( ptr );
	return NULL;
}
#endif

typedef char RX_Char;

typedef struct _srx_Context srx_Context;


/* compile / destroy / debug */
srx_Context* srx_CreateExt( const RX_Char* str, size_t strsize, const RX_Char* mods, int* errnpos, srx_MemFunc memfn, void* memctx );
#define srx_Create( str, mods ) srx_CreateExt( str, RX_STRLENGTHFUNC(str), mods, NULL, srx_DefaultMemFunc, NULL )
int srx_Destroy( srx_Context* R );
void srx_DumpToStdout( srx_Context* R );

/* matching and capture access */
int srx_MatchExt( srx_Context* R, const RX_Char* str, size_t size, size_t offset );
#define srx_Match( R, str, off ) srx_MatchExt( R, str, RX_STRLENGTHFUNC(str), off )
int srx_GetCaptureCount( srx_Context* R );
int srx_GetCaptured( srx_Context* R, int which, size_t* pbeg, size_t* pend );
int srx_GetCapturedPtrs( srx_Context* R, int which, const RX_Char** pbeg, const RX_Char** pend );

/* replacement; the result must be released with srx_FreeReplaced */
RX_Char* srx_ReplaceExt( srx_Context* R, const RX_Char* str, size_t strsize, const RX_Char* rep, size_t repsize, size_t* outsize );
#define srx_Replace( R, str, rep ) srx_ReplaceExt( R, str, RX_STRLENGTHFUNC(str), rep, RX_STRLENGTHFUNC(rep), NULL )
void srx_FreeReplaced( srx_Context* R, RX_Char* repstr );


#ifdef __cplusplus
}
#endif
D vendor/slre.nim

@@ -1,125 +0,0 @@

-# -# Copyright (c) 2004-2005 Sergey Lyubka <valenok@gmail.com> -# All rights reserved -# -# "THE BEER-WARE LICENSE" (Revision 42): -# Sergey Lyubka wrote this file. As long as you retain this notice you -# can do whatever you want with this stuff. If we meet some day, and you think -# this stuff is worth it, you can buy me a beer in return. -# -# -# This is a regular expression library that implements a subset of Perl RE. -# Please refer to http://slre.sourceforge.net for detailed description. -# -# Usage example (parsing HTTP request): -# -# struct slre slre; -# struct cap captures[4 + 1]; // Number of braket pairs + 1 -# ... -# -# slre_compile(&slre,"^(GET|POST) (\S+) HTTP/(\S+?)\r\n"); -# -# if (slre_match(&slre, buf, len, captures)) { -# printf("Request line length: %d\n", captures[0].len); -# printf("Method: %.*s\n", captures[1].len, captures[1].ptr); -# printf("URI: %.*s\n", captures[2].len, captures[2].ptr); -# } -# -# Supported syntax: -# ^ Match beginning of a buffer -# $ Match end of a buffer -# () Grouping and substring capturing -# [...] Match any character from set -# [^...] Match any character but ones from set -# \s Match whitespace -# \S Match non-whitespace -# \d Match decimal digit -# \r Match carriage return -# \n Match newline -# + Match one or more times (greedy) -# +? Match one or more times (non-greedy) -# * Match zero or more times (greedy) -# *? Match zero or more times (non-greedy) -# ? 
Match zero or once -# \xDD Match byte with hex value 0xDD -# \meta Match one of the meta character: ^$().[*+?\ -# - -{.compile: "slre/libslre.c".} -# -# Compiled regular expression -# -type - slre* = object - code*: array[256, cuchar] - data*: array[256, cuchar] - code_size*: cint - data_size*: cint - num_caps*: cint # Number of bracket pairs - anchored*: cint # Must match from string start - err_str*: cstring # Error string - -# -# Captured substring -# -type - cap* = object - value*: cstring # Pointer to the substring - len*: cint # Substring length - -# -# Compile regular expression. If success, 1 is returned. -# If error, 0 is returned and slre.err_str points to the error message. -# -proc slre_compile(a2: ptr slre; re: cstring): cint {.importc.} -# -# Return 1 if match, 0 if no match. -# If `captured_substrings' array is not NULL, then it is filled with the -# values of captured substrings. captured_substrings[0] element is always -# a full matched substring. The round bracket captures start from -# captured_substrings[1]. -# It is assumed that the size of captured_substrings array is enough to -# hold all captures. The caller function must make sure it is! 
So, the -# array_size = number_of_round_bracket_pairs + 1 -# -proc slre_match(a2: ptr slre; buf: cstring; buf_len: cint; - captured_substrings: openarray[cap]): cint {.importc.} - -# High level API -from strutils import contains, replace, parseInt -from sequtils import delete - -proc match*(s: string, re: string): seq[string] = - var rawre = cast[ptr slre](alloc0(sizeof(slre))) - if slre_compile(rawre, re) == 1: - var matches:array[10, cap] - if rawre.slre_match(s.cstring, s.len.cint, matches) == 1: - var res = newSeq[string](0) - for i in items(matches): - if i.value != nil: - var str = $(i.value) - res.add str.substr(0, i.len-1) - return res - else: - return newSeq[string](0) - else: - raise newException(ValueError, $(rawre.err_str)) - -proc gsub*(s_find: string, re: string, s_replace: string): string = - var matches = s_find.match(re) - if matches.len > 0: - var res = s_find.replace(matches[0], s_replace) - if matches.len > 1: - # Replace captures - var caps = res.match("\\$(\\d)") - if caps.len > 1: - # Remove first (global) match - caps.delete(0, 0) - for c in caps: - var ci = parseInt(c) - # Replace $-placeholders with captures - while res.contains("$"&c): - res = res.replace("$"&c, matches[ci]) - return res - else: - return s_find
D vendor/slre/libslre.c

@@ -1,667 +0,0 @@

-/* - * Copyright (c) 2004-2005 Sergey Lyubka <valenok@gmail.com> - * All rights reserved - * - * "THE BEER-WARE LICENSE" (Revision 42): - * Sergey Lyubka wrote this file. As long as you retain this notice you - * can do whatever you want with this stuff. If we meet some day, and you think - * this stuff is worth it, you can buy me a beer in return. - */ - -#include <stdio.h> -#include <assert.h> -#include <ctype.h> -#include <stdlib.h> -#include <string.h> -#include <errno.h> - -#include "slre.h" - -enum {END, BRANCH, ANY, EXACT, ANYOF, ANYBUT, OPEN, CLOSE, BOL, EOL, - STAR, PLUS, STARQ, PLUSQ, QUEST, SPACE, NONSPACE, DIGIT}; - -static struct { - const char *name; - int narg; - const char *flags; -} opcodes[] = { - {"END", 0, ""}, /* End of code block or program */ - {"BRANCH", 2, "oo"}, /* Alternative operator, "|" */ - {"ANY", 0, ""}, /* Match any character, "." */ - {"EXACT", 2, "d"}, /* Match exact string */ - {"ANYOF", 2, "D"}, /* Match any from set, "[]" */ - {"ANYBUT", 2, "D"}, /* Match any but from set, "[^]"*/ - {"OPEN ", 1, "i"}, /* Capture start, "(" */ - {"CLOSE", 1, "i"}, /* Capture end, ")" */ - {"BOL", 0, ""}, /* Beginning of string, "^" */ - {"EOL", 0, ""}, /* End of string, "$" */ - {"STAR", 1, "o"}, /* Match zero or more times "*" */ - {"PLUS", 1, "o"}, /* Match one or more times, "+" */ - {"STARQ", 1, "o"}, /* Non-greedy STAR, "*?" */ - {"PLUSQ", 1, "o"}, /* Non-greedy PLUS, "+?" */ - {"QUEST", 1, "o"}, /* Match zero or one time, "?" */ - {"SPACE", 0, ""}, /* Match whitespace, "\s" */ - {"NONSPACE", 0, ""}, /* Match non-space, "\S" */ - {"DIGIT", 0, ""} /* Match digit, "\d" */ -}; - -/* - * Commands and operands are all unsigned char (1 byte long). All code offsets - * are relative to current address, and positive (always point forward). Data - * offsets are absolute. Commands with operands: - * - * BRANCH offset1 offset2 - * Try to match the code block that follows the BRANCH instruction - * (code block ends with END). 
If no match, try to match code block that - * starts at offset1. If either of these match, jump to offset2. - * - * EXACT data_offset data_length - * Try to match exact string. String is recorded in data section from - * data_offset, and has length data_length. - * - * OPEN capture_number - * CLOSE capture_number - * If the user have passed 'struct cap' array for captures, OPEN - * records the beginning of the matched substring (cap->ptr), CLOSE - * sets the length (cap->len) for respective capture_number. - * - * STAR code_offset - * PLUS code_offset - * QUEST code_offset - * *, +, ?, respectively. Try to gobble as much as possible from the - * matched buffer, until code block that follows these instructions - * matches. When the longest possible string is matched, - * jump to code_offset - * - * STARQ, PLUSQ are non-greedy versions of STAR and PLUS. - */ - -static const char *meta_chars = "|.^$*+?()[\\"; - -static void -print_character_set(FILE *fp, const unsigned char *p, int len) -{ - int i; - - for (i = 0; i < len; i++) { - if (i > 0) - (void) fputc(',', fp); - if (p[i] == 0) { - i++; - if (p[i] == 0) - (void) fprintf(fp, "\\x%02x", p[i]); - else - (void) fprintf(fp, "%s", opcodes[p[i]].name); - } else if (isprint(p[i])) { - (void) fputc(p[i], fp); - } else { - (void) fprintf(fp,"\\x%02x", p[i]); - } - } -} - -void -slre_dump(const struct slre *r, FILE *fp) -{ - int i, j, ch, op, pc; - - for (pc = 0; pc < r->code_size; pc++) { - - op = r->code[pc]; - (void) fprintf(fp, "%3d %s ", pc, opcodes[op].name); - - for (i = 0; opcodes[op].flags[i] != '\0'; i++) - switch (opcodes[op].flags[i]) { - case 'i': - (void) fprintf(fp, "%d ", r->code[pc + 1]); - pc++; - break; - case 'o': - (void) fprintf(fp, "%d ", - pc + r->code[pc + 1] - i); - pc++; - break; - case 'D': - print_character_set(fp, r->data + - r->code[pc + 1], r->code[pc + 2]); - pc += 2; - break; - case 'd': - (void) fputc('"', fp); - for (j = 0; j < r->code[pc + 2]; j++) { - ch = r->data[r->code[pc + 1] + j]; 
- if (isprint(ch)) - (void) fputc(ch, fp); - else - (void) fprintf(fp,"\\x%02x",ch); - } - (void) fputc('"', fp); - pc += 2; - break; - } - - (void) fputc('\n', fp); - } -} - -static void -set_jump_offset(struct slre *r, int pc, int offset) -{ - assert(offset < r->code_size); - - if (r->code_size - offset > 0xff) { - r->err_str = "Jump offset is too big"; - } else { - r->code[pc] = (unsigned char) (r->code_size - offset); - } -} - -static void -emit(struct slre *r, int code) -{ - if (r->code_size >= (int) (sizeof(r->code) / sizeof(r->code[0]))) - r->err_str = "RE is too long (code overflow)"; - else - r->code[r->code_size++] = (unsigned char) code; -} - -static void -store_char_in_data(struct slre *r, int ch) -{ - if (r->data_size >= (int) sizeof(r->data)) - r->err_str = "RE is too long (data overflow)"; - else - r->data[r->data_size++] = ch; -} - -static void -exact(struct slre *r, const char **re) -{ - int old_data_size = r->data_size; - - while (**re != '\0' && (strchr(meta_chars, **re)) == NULL) - store_char_in_data(r, *(*re)++); - - emit(r, EXACT); - emit(r, old_data_size); - emit(r, r->data_size - old_data_size); -} - -static int -get_escape_char(const char **re) -{ - int res; - - switch (*(*re)++) { - case 'n': res = '\n'; break; - case 'r': res = '\r'; break; - case 't': res = '\t'; break; - case '0': res = 0; break; - case 'S': res = NONSPACE << 8; break; - case 's': res = SPACE << 8; break; - case 'd': res = DIGIT << 8; break; - default: res = (*re)[-1]; break; - } - - return (res); -} - -static void -anyof(struct slre *r, const char **re) -{ - int esc, old_data_size = r->data_size, op = ANYOF; - - if (**re == '^') { - op = ANYBUT; - (*re)++; - } - - while (**re != '\0') - - switch (*(*re)++) { - case ']': - emit(r, op); - emit(r, old_data_size); - emit(r, r->data_size - old_data_size); - return; - /* NOTREACHED */ - break; - case '\\': - esc = get_escape_char(re); - if ((esc & 0xff) == 0) { - store_char_in_data(r, 0); - store_char_in_data(r, esc >> 8); - 
} else { - store_char_in_data(r, esc); - } - break; - default: - store_char_in_data(r, (*re)[-1]); - break; - } - - r->err_str = "No closing ']' bracket"; -} - -static void -relocate(struct slre *r, int begin, int shift) -{ - emit(r, END); - memmove(r->code + begin + shift, r->code + begin, r->code_size - begin); - r->code_size += shift; -} - -static void -quantifier(struct slre *r, int prev, int op) -{ - if (r->code[prev] == EXACT && r->code[prev + 2] > 1) { - r->code[prev + 2]--; - emit(r, EXACT); - emit(r, r->code[prev + 1] + r->code[prev + 2]); - emit(r, 1); - prev = r->code_size - 3; - } - relocate(r, prev, 2); - r->code[prev] = op; - set_jump_offset(r, prev + 1, prev); -} - -static void -exact_one_char(struct slre *r, int ch) -{ - emit(r, EXACT); - emit(r, r->data_size); - emit(r, 1); - store_char_in_data(r, ch); -} - -static void -fixup_branch(struct slre *r, int fixup) -{ - if (fixup > 0) { - emit(r, END); - set_jump_offset(r, fixup, fixup - 2); - } -} - -static void -compile(struct slre *r, const char **re) -{ - int op, esc, branch_start, last_op, fixup, cap_no, level; - - fixup = 0; - level = r->num_caps; - branch_start = last_op = r->code_size; - - for (;;) - switch (*(*re)++) { - case '\0': - (*re)--; - return; - /* NOTREACHED */ - break; - case '^': - emit(r, BOL); - break; - case '$': - emit(r, EOL); - break; - case '.': - last_op = r->code_size; - emit(r, ANY); - break; - case '[': - last_op = r->code_size; - anyof(r, re); - break; - case '\\': - last_op = r->code_size; - esc = get_escape_char(re); - if (esc & 0xff00) { - emit(r, esc >> 8); - } else { - exact_one_char(r, esc); - } - break; - case '(': - last_op = r->code_size; - cap_no = ++r->num_caps; - emit(r, OPEN); - emit(r, cap_no); - - compile(r, re); - if (*(*re)++ != ')') { - r->err_str = "No closing bracket"; - return; - } - - emit(r, CLOSE); - emit(r, cap_no); - break; - case ')': - (*re)--; - fixup_branch(r, fixup); - if (level == 0) { - r->err_str = "Unbalanced brackets"; - return; - } - 
return; - /* NOTREACHED */ - break; - case '+': - case '*': - op = (*re)[-1] == '*' ? STAR: PLUS; - if (**re == '?') { - (*re)++; - op = op == STAR ? STARQ : PLUSQ; - } - quantifier(r, last_op, op); - break; - case '?': - quantifier(r, last_op, QUEST); - break; - case '|': - fixup_branch(r, fixup); - relocate(r, branch_start, 3); - r->code[branch_start] = BRANCH; - set_jump_offset(r, branch_start + 1, branch_start); - fixup = branch_start + 2; - r->code[fixup] = 0xff; - break; - default: - (*re)--; - last_op = r->code_size; - exact(r, re); - break; - } -} - -int -slre_compile(struct slre *r, const char *re) -{ - r->err_str = NULL; - r->code_size = r->data_size = r->num_caps = r->anchored = 0; - - if (*re == '^') - r->anchored++; - - emit(r, OPEN); /* This will capture what matches full RE */ - emit(r, 0); - - while (*re != '\0') - compile(r, &re); - - if (r->code[2] == BRANCH) - fixup_branch(r, 4); - - emit(r, CLOSE); - emit(r, 0); - emit(r, END); - - return (r->err_str == NULL ? 1 : 0); -} - -static int match(const struct slre *, int, - const char *, int, int *, struct cap *); - -static void -loop_greedy(const struct slre *r, int pc, const char *s, int len, int *ofs) -{ - int saved_offset, matched_offset; - - saved_offset = matched_offset = *ofs; - - while (match(r, pc + 2, s, len, ofs, NULL)) { - saved_offset = *ofs; - if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL)) - matched_offset = saved_offset; - *ofs = saved_offset; - } - - *ofs = matched_offset; -} - -static void -loop_non_greedy(const struct slre *r, int pc, const char *s,int len, int *ofs) -{ - int saved_offset = *ofs; - - while (match(r, pc + 2, s, len, ofs, NULL)) { - saved_offset = *ofs; - if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL)) - break; - } - - *ofs = saved_offset; -} - -static int -is_any_of(const unsigned char *p, int len, const char *s, int *ofs) -{ - int i, ch; - - ch = s[*ofs]; - - for (i = 0; i < len; i++) - if (p[i] == ch) { - (*ofs)++; - return (1); - } - - return (0); -} 
- -static int -is_any_but(const unsigned char *p, int len, const char *s, int *ofs) -{ - int i, ch; - - ch = s[*ofs]; - - for (i = 0; i < len; i++) - if (p[i] == ch) - return (0); - - (*ofs)++; - return (1); -} - -static int -match(const struct slre *r, int pc, const char *s, int len, - int *ofs, struct cap *caps) -{ - int n, saved_offset, res = 1; - - while (res && r->code[pc] != END) { - - assert(pc < r->code_size); - assert(pc < (int) (sizeof(r->code) / sizeof(r->code[0]))); - - switch (r->code[pc]) { - case BRANCH: - saved_offset = *ofs; - res = match(r, pc + 3, s, len, ofs, caps); - if (res == 0) { - *ofs = saved_offset; - res = match(r, pc + r->code[pc + 1], - s, len, ofs, caps); - } - pc += r->code[pc + 2]; - break; - case EXACT: - res = 0; - n = r->code[pc + 2]; /* String length */ - if (n <= len - *ofs && !memcmp(s + *ofs, r->data + - r->code[pc + 1], n)) { - (*ofs) += n; - res = 1; - } - pc += 3; - break; - case QUEST: - res = 1; - saved_offset = *ofs; - if (!match(r, pc + 2, s, len, ofs, caps)) - *ofs = saved_offset; - pc += r->code[pc + 1]; - break; - case STAR: - res = 1; - loop_greedy(r, pc, s, len, ofs); - pc += r->code[pc + 1]; - break; - case STARQ: - res = 1; - loop_non_greedy(r, pc, s, len, ofs); - pc += r->code[pc + 1]; - break; - case PLUS: - if ((res = match(r, pc + 2, s, len, ofs, caps)) == 0) - break; - - loop_greedy(r, pc, s, len, ofs); - pc += r->code[pc + 1]; - break; - case PLUSQ: - if ((res = match(r, pc + 2, s, len, ofs, caps)) == 0) - break; - - loop_non_greedy(r, pc, s, len, ofs); - pc += r->code[pc + 1]; - break; - case SPACE: - res = 0; - if (*ofs < len && isspace(((unsigned char *)s)[*ofs])) { - (*ofs)++; - res = 1; - } - pc++; - break; - case NONSPACE: - res = 0; - if (*ofs <len && !isspace(((unsigned char *)s)[*ofs])) { - (*ofs)++; - res = 1; - } - pc++; - break; - case DIGIT: - res = 0; - if (*ofs < len && isdigit(((unsigned char *)s)[*ofs])) { - (*ofs)++; - res = 1; - } - pc++; - break; - case ANY: - res = 0; - if (*ofs < len) 
{ - (*ofs)++; - res = 1; - } - pc++; - break; - case ANYOF: - res = 0; - if (*ofs < len) - res = is_any_of(r->data + r->code[pc + 1], - r->code[pc + 2], s, ofs); - pc += 3; - break; - case ANYBUT: - res = 0; - if (*ofs < len) - res = is_any_but(r->data + r->code[pc + 1], - r->code[pc + 2], s, ofs); - pc += 3; - break; - case BOL: - res = *ofs == 0 ? 1 : 0; - pc++; - break; - case EOL: - res = *ofs == len ? 1 : 0; - pc++; - break; - case OPEN: - if (caps != NULL) - caps[r->code[pc + 1]].ptr = s + *ofs; - pc += 2; - break; - case CLOSE: - if (caps != NULL) - caps[r->code[pc + 1]].len = (s + *ofs) - - caps[r->code[pc + 1]].ptr; - pc += 2; - break; - case END: - pc++; - break; - default: - printf("unknown cmd (%d) at %d\n", r->code[pc], pc); - assert(0); - break; - } - } - - return (res); -} - -int -slre_match(const struct slre *r, const char *buf, int len, - struct cap *caps) -{ - int i, ofs = 0, res = 0; - - if (r->anchored) { - res = match(r, 0, buf, len, &ofs, caps); - } else { - for (i = 0; i < len && res == 0; i++) { - ofs = i; - res = match(r, 0, buf, len, &ofs, caps); - } - } - - return (res); -} - -#ifdef TEST -int main(int argc, char *argv[]) -{ - struct slre slre; - struct cap caps[20]; - char data[1 * 1024 * 1024]; - FILE *fp; - int i, count, res, len; - - if (argc < 3) { - printf("Usage: %s 'slre' <file> [count]\n", argv[0]); - } else if ((fp = fopen(argv[2], "rb")) == NULL) { - printf("Error: cannot open %s:%s\n", argv[2], strerror(errno)); - } else if (!slre_compile(&slre, argv[1])) { - printf("Error compiling slre: %s\n", slre.err_str); - } else { - slre_dump(&slre, stderr); - - (void) memset(caps, 0, sizeof(caps)); - - /* Read first 128K of file */ - len = fread(data, 1, sizeof(data), fp); - (void) fclose(fp); - - res = 0; - count = argc > 3 ? 
atoi(argv[3]) : 1; - for (i = 0; i < count; i++) - res = slre_match(&slre, data, len, caps); - - printf("Result: %d\n", res); - - for (i = 0; i < 20; i++) - if (caps[i].len > 0) - printf("Substring %d: [%.*s]\n", i, - caps[i].len, caps[i].ptr); - } - - return (0); -} -#endif /* TEST */
D vendor/slre/slre.h

@@ -1,92 +0,0 @@

-/* - * Copyright (c) 2004-2005 Sergey Lyubka <valenok@gmail.com> - * All rights reserved - * - * "THE BEER-WARE LICENSE" (Revision 42): - * Sergey Lyubka wrote this file. As long as you retain this notice you - * can do whatever you want with this stuff. If we meet some day, and you think - * this stuff is worth it, you can buy me a beer in return. - */ - -/* - * This is a regular expression library that implements a subset of Perl RE. - * Please refer to http://slre.sourceforge.net for detailed description. - * - * Usage example (parsing HTTP request): - * - * struct slre slre; - * struct cap captures[4 + 1]; // Number of braket pairs + 1 - * ... - * - * slre_compile(&slre,"^(GET|POST) (\S+) HTTP/(\S+?)\r\n"); - * - * if (slre_match(&slre, buf, len, captures)) { - * printf("Request line length: %d\n", captures[0].len); - * printf("Method: %.*s\n", captures[1].len, captures[1].ptr); - * printf("URI: %.*s\n", captures[2].len, captures[2].ptr); - * } - * - * Supported syntax: - * ^ Match beginning of a buffer - * $ Match end of a buffer - * () Grouping and substring capturing - * [...] Match any character from set - * [^...] Match any character but ones from set - * \s Match whitespace - * \S Match non-whitespace - * \d Match decimal digit - * \r Match carriage return - * \n Match newline - * + Match one or more times (greedy) - * +? Match one or more times (non-greedy) - * * Match zero or more times (greedy) - * *? Match zero or more times (non-greedy) - * ? 
Match zero or once - * \xDD Match byte with hex value 0xDD - * \meta Match one of the meta character: ^$().[*+?\ - */ - -#ifndef SLRE_HEADER_DEFINED -#define SLRE_HEADER_DEFINED - -/* - * Compiled regular expression - */ -struct slre { - unsigned char code[256]; - unsigned char data[256]; - int code_size; - int data_size; - int num_caps; /* Number of bracket pairs */ - int anchored; /* Must match from string start */ - const char *err_str; /* Error string */ -}; - -/* - * Captured substring - */ -struct cap { - const char *ptr; /* Pointer to the substring */ - int len; /* Substring length */ -}; - -/* - * Compile regular expression. If success, 1 is returned. - * If error, 0 is returned and slre.err_str points to the error message. - */ -int slre_compile(struct slre *, const char *re); - -/* - * Return 1 if match, 0 if no match. - * If `captured_substrings' array is not NULL, then it is filled with the - * values of captured substrings. captured_substrings[0] element is always - * a full matched substring. The round bracket captures start from - * captured_substrings[1]. - * It is assumed that the size of captured_substrings array is enough to - * hold all captures. The caller function must make sure it is! So, the - * array_size = number_of_round_bracket_pairs + 1 - */ -int slre_match(const struct slre *, const char *buf, int buf_len, - struct cap *captured_substrings); - -#endif /* SLRE_HEADER_DEFINED */