code.H3RALD.com — min: a0598c9364e04d2c6691d8620e379fe7e82a9562

Replaced SLRE with SGRegex.

h3rald h3rald@h3rald.com

Sat, 28 May 2016 22:20:05 +0200

commit

a0598c9364e04d2c6691d8620e379fe7e82a9562

parent

fbacd017237284143204d32aeb792d861a6c8f47

7 files changed, 1288 insertions(+), 884 deletions(-)

jump to

core/regex.nim

vendor/sgregex.nim

vendor/sgregex/libregex.c

vendor/sgregex/regex.h

A core/regex.nim

@@ -0,0 +1,60 @@ 
+import strutils
+import ../vendor/sgregex
+
+
+proc match*(str, pattern, mods: string): bool =
+  ## Returns true when `pattern`, compiled with the modifier flags in `mods`
+  ## (any combination of "m", "i" and "s"), matches somewhere in `str`.
+  ##
+  ## Raises `ValueError` when the pattern or the modifiers cannot be compiled.
+  let r = srx_Create(pattern, mods)
+  if r.isNil:
+    # srx_Create yields nil on a compile error; matching against a nil
+    # context would dereference a null pointer inside the C library.
+    raise newException(ValueError,
+      "Invalid regular expression: /" & pattern & "/" & mods)
+  result = srx_Match(r, str, 0) == 1
+  discard srx_Destroy(r)
+
+proc match*(str, pattern: string): bool =
+  ## Shorthand for the three-argument `match` with no modifier flags.
+  result = str.match(pattern, "")
+
+proc search*(str, pattern, mods: string): seq[string] =
+  ## Matches `pattern` (compiled with the modifier flags in `mods`) against
+  ## `str` and returns one string per capture slot of the pattern: entry 0 is
+  ## the whole match, the following entries are the parenthesised groups.
+  ##
+  ## When nothing matches, or a slot holds no usable range, the corresponding
+  ## entry is the empty string.
+  ##
+  ## Raises `ValueError` when the pattern or the modifiers cannot be compiled.
+  let r = srx_Create(pattern, mods)
+  if r.isNil:
+    # A nil context would be dereferenced by the C library.
+    raise newException(ValueError,
+      "Invalid regular expression: /" & pattern & "/" & mods)
+  let matched = srx_Match(r, str, 0) == 1
+  let count = srx_GetCaptureCount(r)
+  result = newSeq[string](count)
+  for i in 0..count-1:
+    result[i] = ""
+    if not matched:
+      # After a failed match the capture pointers are reset, so any offsets
+      # read from them would be meaningless.
+      continue
+    var first = 0
+    var last = 0
+    if srx_GetCaptured(r, i, addr first, addr last) == 1 and
+        first >= 0 and first <= last and last <= str.len:
+      # Only slice when the reported range lies inside `str`; a group that
+      # did not take part in the match reports offsets outside of it.
+      result[i] = str.substr(first, last-1)
+  discard srx_Destroy(r)
+
+proc search*(str, pattern: string): seq[string] =
+  ## Shorthand for the three-argument `search` with no modifier flags.
+  result = str.search(pattern, "")
+
+proc replace*(str, pattern, repl, mods: string): string =
+  ## Returns a copy of `str` in which every match of `pattern` (compiled with
+  ## the modifier flags in `mods`) is substituted by `repl`. Inside `repl`,
+  ## `$N` or `\N` (N a single digit) inserts the text of capture N.
+  ##
+  ## Raises `ValueError` when the pattern or the modifiers cannot be compiled.
+  let r = srx_Create(pattern, mods)
+  if r.isNil:
+    # A nil context would be dereferenced by the C library.
+    raise newException(ValueError,
+      "Invalid regular expression: /" & pattern & "/" & mods)
+  let replaced = srx_Replace(r, str, repl)
+  # `$` copies the C buffer into a Nim string, so the buffer can (and must)
+  # be handed back to the library; this has to happen before srx_Destroy
+  # because the context's allocator is used to free it.
+  result = $replaced
+  srx_FreeReplaced(r, replaced)
+  discard srx_Destroy(r)
+
+proc replace*(str, pattern, repl: string): string =
+  ## Shorthand for the four-argument `replace` with no modifier flags.
+  result = str.replace(pattern, repl, "")
+
+when isMainModule:  # manual smoke test: prints each result, nothing is asserted
+
+  proc tmatch(str, pattern: string) =  # echo the outcome of match without modifiers
+    echo str, " =~ ", "/", pattern, "/", " -> ", str.match(pattern)
+
+  proc tsearch(str, pattern: string) =  # echo the captures returned by search without modifiers
+    echo str, " =~ ", "/", pattern, "/", " -> ", str.search(pattern)
+
+  proc tsearch(str, pattern, mods: string) =  # same, with explicit modifier flags
+    echo str, " =~ ", "/", pattern, "/", mods, " -> ", str.search(pattern, mods)
+
+  proc treplace(str, pattern, repl: string) =  # echo the outcome of replace without modifiers
+    echo str, " =~ ", "s/", pattern, "/", repl, "/", " -> ", str.replace(pattern, repl)
+
+
+  "HELLO".tmatch("^H(.*)O$")
+  "HELLO".tmatch("^H(.*)S$")
+  "HELLO".tsearch("^H(E)(.*)O$")
+  "Hello, World!".treplace("[a-zA-Z]+,", "Goodbye,")
+  "127.0.0.1".tsearch("^([0-9]{1,3})\\.([0-9]{1,3})\\.([0-9]{1,3})\\.([0-9]{1,3})$")
+  "127.0.0.1".treplace("^([0-9]{1,3})\\.([0-9]{1,3})\\.([0-9]{1,3})\\.([0-9]{1,3})$", "$4.$3.$1.$2")  # capture references in the replacement
+  "127.0.0.1".treplace("[0-9]+", "255")
+  "Hello".tsearch("HELLO", "i")  # case-insensitive
+  "Hello\nWorld!".tsearch("HELLO.WORLD", "mis")  # all three modifiers at once
+

A vendor/sgregex.nim

@@ -0,0 +1,57 @@ 
+{.compile: "sgregex/libregex.c".}
+const 
+  RXSUCCESS* = 0  # no error
+  RXEINMOD* = - 1  # invalid modifier
+  RXEPART* = - 2  # partial (sub-)expression
+  RXEUNEXP* = - 3  # unexpected character
+  RXERANGE* = - 4  # invalid range (min > max)
+  RXELIMIT* = - 5  # too many digits
+  RXEEMPTY* = - 6  # expression is effectively empty
+  RXENOREF* = - 7  # the specified backreference cannot be used here
+  RX_ALLMODS* = "mis"  # every modifier letter the compiler accepts
+
+
+type 
+  # Allocator callback handed to srx_CreateExt (userdata, ptr, size).
+  # It must be a plain C function pointer: without an explicit calling
+  # convention a Nim proc type is a closure (code pointer plus environment),
+  # which does not match the C `srx_MemFunc` typedef.
+  srx_MemFunc* = proc (a2: pointer; a3: pointer; a4: csize): pointer {.cdecl.}
+
+proc RX_STRLENGTHFUNC*(str: string): int = 
+  ## Length of `str`; Nim-side counterpart of the C RX_STRLENGTHFUNC macro.
+  result = len(str)
+
+proc srx_DefaultMemFunc*(userdata: pointer, ptr1: pointer, size: csize): pointer = # placeholder: performs no allocation and always returns nil
+  #cast[ptr string](userdata)
+  #if not size.isNil: 
+  #return realloc(ptr, size)
+  #free(ptr1)
+  return nil  # NOTE(review): not usable as a real allocator in this state; the srx_Create template passes nil instead of this proc
+
+type 
+  srx_Context* = object  # opaque on the Nim side (no fields declared); only used through `ptr srx_Context`
+
+# The procs below bind to the C functions of the same name compiled from
+# sgregex/libregex.c. The C calling convention is stated explicitly, and the
+# pushed pragmas are popped again so they do not leak onto later declarations.
+{.push importc, cdecl.}
+proc srx_CreateExt*(str: cstring; strsize: csize; mods: cstring; 
+                    errnpos: ptr cint; memfn: srx_MemFunc; memctx: pointer): ptr srx_Context
+proc srx_Destroy*(R: ptr srx_Context): cint
+proc srx_DumpToStdout*(R: ptr srx_Context)
+proc srx_MatchExt*(R: ptr srx_Context; str: cstring; size: csize; 
+                   offset: csize): cint
+proc srx_GetCaptureCount*(R: ptr srx_Context): cint
+proc srx_GetCaptured*(R: ptr srx_Context; which: cint; pbeg: ptr csize; 
+                      pend: ptr csize): cint
+# NOTE(review): the C prototype takes `const RX_Char**` for pbeg/pend;
+# `cstringArray` is kept here so the Nim signature stays unchanged.
+proc srx_GetCapturedPtrs*(R: ptr srx_Context; which: cint; 
+                          pbeg: cstringArray; pend: cstringArray): cint
+proc srx_ReplaceExt*(R: ptr srx_Context; str: cstring; strsize: csize; 
+                     rep: cstring; repsize: csize; outsize: ptr csize): cstring
+proc srx_FreeReplaced*(R: ptr srx_Context; repstr: cstring)
+{.pop.}
+
+# Convenience wrappers mirroring the C macros of the same name: they supply
+# the string lengths and default arguments.
+template srx_Create*(str, mods: expr): expr = 
+  # A nil memfn makes srx_CreateExt fall back to the C default allocator.
+  srx_CreateExt(str, RX_STRLENGTHFUNC(str), mods, nil, nil, nil)
+
+template srx_Match*(R, str, off: expr): expr = 
+  srx_MatchExt(R, str, RX_STRLENGTHFUNC(str), off)
+
+template srx_Replace*(R, str, rep: expr): expr = 
+  srx_ReplaceExt(R, str, RX_STRLENGTHFUNC(str), rep, RX_STRLENGTHFUNC(rep), nil)
+
+
+
+

A vendor/sgregex/libregex.c

@@ -0,0 +1,1101 @@ 
+
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+
+#define MAX(a,b) ((a)>(b)?(a):(b))
+
+
+#define RX_NEED_DEFAULT_MEMFUNC
+#include "regex.h"
+
+
+#define RX_MAX_CAPTURES 10
+
+
+#define RX_MALLOC( bytes ) R->memfn( R->memctx, NULL, bytes )
+#define RX_ALLOC_N( what, N ) (what*) R->memfn( R->memctx, NULL, sizeof( what ) * ((size_t)(N)) )
+#define RX_ALLOC( what ) RX_ALLOC_N( what, 1 )
+#define RX_FREE( ptr ) R->memfn( R->memctx, ptr, 0 )
+
+#define RX_IS_ALPHA( x ) rx_isalpha( x )
+#define RX_EQUALIZE( x ) rx_tolower( x )
+
+
+#define RIT_MATCH  1 /* matching */
+#define RIT_RANGE  2
+#define RIT_SPCBEG 3
+#define RIT_SPCEND 4
+#define RIT_BKREF  5
+#define RIT_EITHER 11 /* control */
+#define RIT_SUBEXP 12
+
+#define RIF_LAZY   0x01
+#define RIF_INVERT 0x02
+
+#define RCF_MULTILINE 0x01 /* ^/$ matches beginning/end of line too */
+#define RCF_CASELESS  0x02 /* pre-equalized case for match/range */
+#define RCF_DOTALL    0x04 /* "." is compiled as "[^]" instead of "[^\r\n]" */
+
+#ifndef RXLOG
+#define RXLOG 0
+#endif
+
+#if RXLOG
+#define RXLOGINFO( x ) x
+#else
+#define RXLOGINFO( x )
+#endif
+#define RX_LOGLIM( str, strend, off ) (int)((strend)-(str)<(off)?(strend)-(str):(off)), str
+
+
+static int rx_isalpha( RX_Char c )
+{
+	/* ASCII letters only; does not go through the locale-dependent isalpha() */
+	if( c >= 'a' && c <= 'z' )
+		return 1;
+	if( c >= 'A' && c <= 'Z' )
+		return 1;
+	return 0;
+}
+
+static RX_Char rx_tolower( RX_Char c )
+{
+	/* maps 'A'..'Z' onto 'a'..'z'; every other character is returned as is */
+	int is_upper = ( c >= 'A' && c <= 'Z' );
+	return is_upper ? (RX_Char)( c - 'A' + 'a' ) : c;
+}
+
+
+typedef struct _regex_item regex_item; /* one node of a compiled pattern */
+struct _regex_item
+{
+	/* structure */
+	regex_item* prev; /* previous node in the same sequence */
+	regex_item* next; /* next node in the same sequence */
+	regex_item* ch, *ch2; /* child sequences: ch for a sub-expression or the first alternative, ch2 for the second alternative of RIT_EITHER */
+	regex_item* pos; /* RIT_SUBEXP: child to resume from; reset to ch */
+	
+	RX_Char* range; /* RIT_RANGE: count pairs of (low, high) characters */
+	int count; /* number of pairs in range */
+	
+	int type, flags; /* RIT_* kind and RIF_* bits */
+	RX_Char a; /* RIT_MATCH: the character; RIT_BKREF: the capture index */
+	int min, max; /* allowed repetition count */
+	
+	/* match state */
+	const RX_Char *matchbeg, *matchend; /* text span this node currently covers */
+	int counter; /* repetition count currently being attempted */
+};
+
+struct _srx_Context /* a compiled pattern together with its allocator and capture table */
+{
+	/* structure */
+	regex_item*    root; /* root RIT_SUBEXP node wrapping the whole pattern */
+	int            flags; /* RCF_* bits taken from the modifier string */
+	
+	/* memory */
+	srx_MemFunc    memfn; /* allocator used for every node and buffer */
+	void*          memctx; /* userdata passed to memfn */
+	
+	/* captures */
+	regex_item*    caps[ RX_MAX_CAPTURES ]; /* capture index -> sub-expression node; slot 0 is the root */
+	int            numcaps; /* number of used slots in caps */
+	
+	/* temporary data */
+	const RX_Char* string; /* pattern text while compiling, subject text after srx_MatchExt */
+	const RX_Char* stringend; /* end of the pattern text given to srx_CreateExt */
+};
+
+typedef struct _match_ctx /* arguments shared by the matching routines */
+{
+	const RX_Char* string; /* start of the subject text */
+	const RX_Char* stringend; /* one past the end of the subject text */
+	regex_item*    item; /* node (or first node of the sequence) being matched */
+	srx_Context*   R; /* owning context, consulted for flags and captures */
+}
+match_ctx;
+
+
+static int regex_test( const RX_Char* str, match_ctx* ctx );
+
+
+static int regex_match_once( match_ctx* ctx ) /* try to match ctx->item once at item->matchend; returns 1 on success (matchend advanced past what was consumed), 0 on failure */
+{
+	int i;
+	regex_item* item = ctx->item;
+	const RX_Char* str = item->matchend;
+	RXLOGINFO( printf( "type %d char %d('%c') action at %p (%.*s)\n",
+		item->type, (int) item->a, item->a, str, RX_LOGLIM(str,ctx->stringend,5) ) );
+	switch( item->type )
+	{
+	case RIT_MATCH: /* single literal character */
+		if( str >= ctx->stringend )
+			break;
+		{
+			RX_Char ch = *str;
+			if( ctx->R->flags & RCF_CASELESS )
+				ch = RX_EQUALIZE( *str ); /* item->a was lowered at compile time */
+			if( ch == item->a )
+			{
+				item->matchend++;
+				return 1;
+			}
+		}
+		break;
+	case RIT_RANGE: /* character class stored as (low, high) pairs */
+		if( str >= ctx->stringend )
+			break;
+		{
+			RX_Char ch = *str;
+			int inv = ( item->flags & RIF_INVERT ) != 0, inrange = 0;
+			if( ctx->R->flags & RCF_CASELESS )
+				ch = RX_EQUALIZE( *str );
+			for( i = 0; i < item->count*2; i += 2 )
+			{
+				if( ch >= item->range[i] && ch <= item->range[i+1] )
+				{
+					inrange = 1;
+					break;
+				}
+			}
+			if( inrange ^ inv ) /* an inverted class matches when no pair contains ch */
+			{
+				item->matchend++;
+				return 1;
+			}
+		}
+		break;
+	case RIT_SPCBEG: /* "^" */
+		if( ctx->R->flags & RCF_MULTILINE && item->matchend < ctx->stringend && ( *item->matchend == '\n' || *item->matchend == '\r' ) )
+		{
+			if( *item->matchend == '\r' && item->matchend[1] == '\n' ) /* treat CR LF as one line break */
+				item->matchend++;
+			item->matchend++;
+			item->matchbeg = item->matchend; /* the line break itself is not part of the span */
+			return 1;
+		}
+		return ctx->string == item->matchend;
+	case RIT_SPCEND: /* "$" */
+		if( ctx->R->flags & RCF_MULTILINE && item->matchend < ctx->stringend && ( *item->matchend == '\n' || *item->matchend == '\r' ) )
+		{
+			return 1;
+		}
+		return str >= ctx->stringend;
+	case RIT_BKREF: /* "\N": compare against the text held by capture N */
+		{
+			regex_item* cap = ctx->R->caps[ (int) item->a ];
+			ptrdiff_t len = cap->matchend - cap->matchbeg;
+			ptrdiff_t len2 = ctx->stringend - str;
+			if( len2 >= len && memcmp( cap->matchbeg, str, (size_t) len ) == 0 )
+			{
+				item->matchend += len;
+				return 1;
+			}
+		}
+		break;
+	case RIT_SUBEXP: /* "( ... )": run the child sequence */
+		{
+			match_ctx cc;
+			{
+				cc.string = ctx->string;
+				cc.stringend = ctx->stringend;
+				cc.item = item->pos ? item->pos : item->ch; /* resume at pos when it is set */
+				cc.R = ctx->R;
+			}
+			if( regex_test( str, &cc ) )
+			{
+				regex_item* p = item->ch;
+				while( p->next )
+					p = p->next;
+				item->pos = NULL;
+				item->matchend = p->matchend; /* the group ends where its last child ended */
+				return 1;
+			}
+		}
+		break;
+	}
+	return 0;
+}
+
+static int regex_match_many( match_ctx* ctx ) /* match ctx->item up to item->counter times starting at item->matchbeg */
+{
+	/* returns whether matched */
+	regex_item* item = ctx->item;
+	item->matchend = item->matchbeg;
+	if( item->type == RIT_EITHER )
+	{
+		regex_item* chi = item->counter ? item->ch2 : item->ch; /* counter selects the alternative: 0 -> ch, otherwise ch2 */
+		match_ctx cc;
+		{
+			cc.string = ctx->string;
+			cc.stringend = ctx->stringend;
+			cc.item = chi;
+			cc.R = ctx->R;
+		}
+		if( regex_test( item->matchbeg, &cc ) )
+		{
+			regex_item* p = chi;
+			while( p->next )
+				p = p->next;
+			item->matchend = p->matchend; /* span ends where the chosen alternative ended */
+			return 1;
+		}
+		return 0;
+	}
+	else
+	{
+		int i;
+		for( i = 0; i < item->counter; ++i )
+		{
+			if( item->matchend >= ctx->stringend && item->type != RIT_SPCEND && item->type != RIT_EITHER && item->type != RIT_SUBEXP ) /* out of input for a node that needs a character */
+			{
+				item->counter = item->flags & RIF_LAZY ? item->max : i;
+				RXLOGINFO( printf( "stopped while matching, counter = %d, %d between %d and %d?\n", item->counter, i, item->min, item->max ) );
+				return i >= item->min && i <= item->max;
+			}
+			if( !regex_match_once( ctx ) )
+			{
+				item->counter = item->flags & RIF_LAZY ? item->max : i; /* greedy: remember how many repetitions were possible */
+				RXLOGINFO( printf( "did not match, counter reset to %d\n", item->counter ) );
+				return i >= item->min && i <= item->max;
+			}
+			RXLOGINFO( else printf( "matched\n" ) );
+		}
+		return 1;
+	}
+}
+
+static void regex_full_reset( regex_item* p );
+static void regex_reset_one( regex_item* p )
+{
+	/* restore both child sequences first, then this node's own match state */
+	if( p->ch )
+		regex_full_reset( p->ch );
+	if( p->ch2 )
+		regex_full_reset( p->ch2 );
+	p->pos = p->ch;
+	p->matchbeg = NULL;
+	p->matchend = NULL;
+	/* lazy nodes start from the fewest repetitions, greedy ones from the most */
+	if( p->flags & RIF_LAZY )
+		p->counter = p->min;
+	else
+		p->counter = p->max;
+}
+static void regex_full_reset( regex_item* p )
+{
+	/* reset p and every node that follows it in the sequence */
+	regex_item* node;
+	for( node = p; node; node = node->next )
+		regex_reset_one( node );
+}
+
+static regex_item* regex_lastch( regex_item* item )
+{
+	/* last node of item's first child sequence, or a null pointer without children */
+	regex_item* last = item->ch;
+	if( last )
+	{
+		while( last->next )
+			last = last->next;
+	}
+	return last;
+}
+
+static int regex_subexp_backtrack( regex_item* item ) /* look for another repetition count to try inside a sub-expression; returns 1 if one was found */
+{
+	int chgh = 0; /* set once the walk has stepped back at least one node */
+	regex_item* p = item->pos ? item->pos : regex_lastch( item ); /* start at pos, or at the last child when pos is unset */
+	
+	while( p )
+	{
+		RXLOGINFO( printf( "backtracker at type %d char %d\n", p->type, (int) p->a ) );
+		if( chgh && p->type == RIT_SUBEXP && regex_subexp_backtrack( p ) )
+			break;
+		else if( p->flags & RIF_LAZY )
+		{
+			p->counter++; /* lazy: try one more repetition */
+			if( p->counter <= p->max )
+				break;
+		}
+		else
+		{
+			p->counter--; /* greedy: try one repetition fewer */
+			if( p->counter >= p->min )
+				break;
+		}
+		RXLOGINFO( printf( "subexp backtrack - reset current, move back\n" ) );
+		regex_reset_one( p );
+		p = p->prev;
+		chgh = 1;
+	}
+	
+	RXLOGINFO( printf( "subexp backtrack - %s\n", p ? "success" : "failure" ) );
+	RXLOGINFO( if( p ) printf( "subexp-backtracked to type %d ctr=%d min=%d max=%d\n", p->type, p->counter, p->min, p->max ) );
+	
+	return !!p;
+}
+
+static int regex_test( const RX_Char* str, match_ctx* ctx ) /* match the sequence starting at ctx->item against the text at str, backtracking on failure; returns 1 on success, 0 otherwise */
+{
+	regex_item* p = ctx->item;
+	p->matchbeg = str;
+	
+	for(;;)
+	{
+		int res;
+		match_ctx cc;
+		{
+			cc.string = ctx->string;
+			cc.stringend = ctx->stringend;
+			cc.item = p;
+			cc.R = ctx->R;
+		}
+		RXLOGINFO( printf( "match_many: item %p type %d at position %p (%.*s)\n",
+			(void*) p, p->type, p->matchbeg, RX_LOGLIM(p->matchbeg,ctx->stringend,5) ) );
+		res = regex_match_many( &cc );
+		if( res )
+		{
+			p = p->next;
+			if( !p ) /* ran off the end of the sequence: everything matched */
+			{
+				RXLOGINFO( printf( "test of subexp %p SUCCEEDED\n", (void*) ctx->item ) );
+				return 1;
+			}
+			RXLOGINFO( printf( "moving on to type %d action\n", p->type ) );
+			p->matchbeg = p->prev->matchend; /* the next node starts where the previous one stopped */
+		}
+		else
+		{
+			int chgh = 0; /* set once the walk has stepped back at least one node */
+			while( p )
+			{
+				if( chgh && p->type == RIT_SUBEXP && regex_subexp_backtrack( p ) )
+					break;
+				else if( p->flags & RIF_LAZY )
+				{
+					p->counter++; /* lazy: try one more repetition */
+					if( p->counter <= p->max )
+						break;
+				}
+				else
+				{
+					p->counter--; /* greedy: try one repetition fewer */
+					if( p->counter >= p->min )
+						break;
+				}
+				RXLOGINFO( printf( "backtrack, reset current\n" ) );
+				regex_reset_one( p );
+				p = p->prev;
+				chgh = 1;
+			}
+			if( !p ) /* no node has a repetition count left to try */
+			{
+				RXLOGINFO( printf( "test of subexp %p BT-ENDED\n", (void*) ctx->item ) );
+				return 0;
+			}
+		}
+	}
+}
+
+static int regex_test_start( const RX_Char* str, match_ctx* ctx )
+{
+	/* wipe the match state left by an earlier attempt, then test at str */
+	RXLOGINFO( printf( "test start - counter reset\n" ) );
+	regex_reset_one( ctx->item );
+	return regex_test( str, ctx );
+}
+
+
+/*
+	mapping:
+	- [^a-zA-Z] ... RIT_RANGE, optional RIF_INVERT
+	- "." ... RIT_RANGE of "\n" and "\r" + RIF_INVERT (empty RIT_RANGE + RIF_INVERT with the "s" modifier)
+	- "\s" and others ... predefined RIT_RANGE with optional RIF_INVERT
+	- "|" ... RIT_EITHER
+	- "(..)" ... RIT_SUBEXP
+	- "?" ... range = [0,1]
+	- "*" ... range = [0,INT_MAX-1]
+	- "+" ... range = [1,INT_MAX-1]
+	- "{1,5}" ... range = [1,5] (other ranges mapped similarly)
+	- "^" ... RIT_SPCBEG
+	- "$" ... RIT_SPCEND
+	- "\1" ... RIT_BKREF
+*/
+
+static void regex_free_item( srx_Context* R, regex_item* item );
+static void regex_dealloc_item( srx_Context* R, regex_item* item )
+{
+	/* releases one node: its range table, both child sequences, then the node */
+	regex_item* first_branch = item->ch;
+	regex_item* second_branch = item->ch2;
+	if( item->range )
+		RX_FREE( item->range );
+	if( first_branch )
+		regex_free_item( R, first_branch );
+	if( second_branch )
+		regex_free_item( R, second_branch );
+	RX_FREE( item );
+}
+
+static void regex_free_item( srx_Context* R, regex_item* item )
+{
+	/* frees the whole sequence item belongs to: predecessors, successors, item */
+	regex_item *cur, *victim;
+	if( !item )
+		return;
+	for( cur = item->prev; cur; )
+	{
+		victim = cur;
+		cur = cur->prev; /* read the link before the node is released */
+		regex_dealloc_item( R, victim );
+	}
+	for( cur = item->next; cur; )
+	{
+		victim = cur;
+		cur = cur->next;
+		regex_dealloc_item( R, victim );
+	}
+	regex_dealloc_item( R, item );
+}
+
+static void regex_level( regex_item** pitem ) /* turn a flat sequence containing RIT_EITHER nodes into a tree: the first such node becomes the head, with the nodes before it as ch and the (recursively leveled) nodes after it as ch2 */
+{
+	/* TODO: balanced/non-(pseudo-)binary leveling */
+	regex_item* item = *pitem;
+	while( item )
+	{
+		if( item->type == RIT_EITHER )
+		{
+			regex_item* next = item->next;
+			regex_level( &next ); /* level the right-hand side first */
+			
+			if( item->prev ) /* cut the node out of the sequence on both sides */
+			{
+				item->prev->next = NULL;
+				item->prev = NULL;
+			}
+			if( item->next )
+			{
+				item->next->prev = NULL;
+				item->next = NULL;
+			}
+			
+			item->ch = *pitem; /* everything before the "|" */
+			item->ch2 = next; /* everything after it */
+			
+			*pitem = item;
+			return;
+		}
+		item = item->next;
+	}
+}
+
+/*
+	Compiles the pattern text between *pstr and pend into a sequence of
+	regex_item nodes and stores its (leveled) head in *out.
+	- R: context supplying the allocator, the flags and the capture table
+	- cel: one flag per capture slot, set to 1 once that group has been closed;
+	  consulted to reject backreferences to groups that are not complete yet
+	- pstr: in: where to start reading; out (on success): where reading stopped
+	- sub: nonzero while compiling the inside of "( ... )" - reading then stops
+	  at the closing parenthesis, which is left for the caller to consume
+	Returns RXSUCCESS, or on failure a value holding the error code in its low
+	4 bits and the pattern offset in the remaining bits; nodes built so far
+	are freed in that case.
+*/
+static int regex_real_compile( srx_Context* R, int* cel, const RX_Char** pstr, const RX_Char* pend, int sub, regex_item** out )
+{
+#define _RX_ALLOC_NODE( ty ) \
+	item = RX_ALLOC( regex_item ); \
+	memset( item, 0, sizeof(*item) ); \
+	if( citem ) \
+	{ \
+		citem->next = item; \
+		item->prev = citem; \
+	} \
+	item->type = ty; \
+	item->min = 1; \
+	item->max = 1;
+
+#define _RXE( err ) for(;;){ error = err; goto fail; }
+	
+	const RX_Char* s = *pstr;
+	regex_item* item = NULL, *citem = NULL;
+	int error = 0;
+	while( s < pend )
+	{
+		if( sub && *s == ')' )
+			break;
+		switch( *s )
+		{
+		case '[':
+			{
+				const RX_Char* sc;
+				int inv = 0, cnt = 0;
+				RX_Char* ri;
+				s++;
+				if( *s == '^' )
+				{
+					inv = 1;
+					s++;
+				}
+				/* first pass: count the (low, high) pairs the class needs */
+				sc = s;
+				if( *sc == ']' )
+				{
+					/* a "]" right after the opening bracket is a literal */
+					sc++;
+					cnt++;
+				}
+				while( *sc && *sc != ']' )
+				{
+					if( *sc == '-' && sc > s && sc[1] != 0 && sc[1] != ']' )
+						sc++;
+					else
+						cnt++;
+					sc++;
+				}
+				if( !*sc )
+					_RXE( RXEPART );
+				_RX_ALLOC_NODE( RIT_RANGE );
+				if( inv )
+					item->flags |= RIF_INVERT;
+				item->range = ri = RX_ALLOC_N( RX_Char, cnt * 2 );
+				item->count = cnt;
+				/* second pass: fill in the pairs */
+				sc = s;
+				if( *sc == ']' )
+				{
+					/* store the literal "]" itself before stepping past it;
+					   advancing first would record the following character
+					   and leave "]" out of the class */
+					ri[0] = ri[1] = *sc;
+					ri += 2;
+					sc++;
+				}
+				while( *sc && *sc != ']' )
+				{
+					if( *sc == '-' && sc > s && sc[1] != 0 && sc[1] != ']' )
+					{
+						/* "a-z": widen the pair written for "a" up to "z" */
+						if( ri > item->range )
+							*(ri-1) = sc[1];
+						sc++;
+					}
+					else
+					{
+						ri[0] = ri[1] = *sc;
+						ri += 2;
+					}
+					sc++;
+				}
+				s = sc;
+				if( *s == ']' )
+					s++;
+				if( R->flags & RCF_CASELESS )
+				{
+					/* subject characters are lowered while matching, so
+					   alphabetic pairs are lowered here as well */
+					int i;
+					ri = item->range;
+					for( i = 0; i < cnt * 2; i += 2 )
+					{
+						RX_Char A = ri[ i ], B = ri[ i + 1 ];
+						if( RX_IS_ALPHA( A ) && RX_IS_ALPHA( B ) )
+						{
+							ri[ i ] = RX_EQUALIZE( A );
+							ri[ i + 1 ] = RX_EQUALIZE( B );
+						}
+					}
+				}
+			}
+			break;
+		case ']':
+			_RXE( RXEUNEXP );
+		case '(':
+			{
+				/* groups beyond RX_MAX_CAPTURES still group but are not recorded */
+				int r, cap = R->numcaps < RX_MAX_CAPTURES ? 1 : -1;
+				_RX_ALLOC_NODE( RIT_SUBEXP );
+				if( cap >= 0 )
+				{
+					cap = R->numcaps++;
+					R->caps[ cap ] = item;
+				}
+				s++;
+				r = regex_real_compile( R, cel, &s, pend, 1, &item->ch );
+				if( r )
+					_RXE( r );
+				item->pos = item->ch;
+				if( *s != ')' )
+					_RXE( RXEUNEXP );
+				if( cap >= 0 )
+					cel[ cap ] = 1;
+				s++;
+			}
+			break;
+		case ')':
+			_RXE( RXEUNEXP );
+		case '{':
+		case '?':
+		case '*':
+		case '+':
+			if( s > *pstr && ( *(s-1) == '}' || *(s-1) == '?' || *(s-1) == '*' || *(s-1) == '+' ) )
+			{
+				/* directly after a quantifier only "?" is allowed: it makes
+				   that quantifier lazy */
+				if( *s == '?' )
+					item->flags |= RIF_LAZY;
+				else
+					_RXE( RXEUNEXP );
+			}
+			else if( item && ( item->type == RIT_MATCH || item->type == RIT_RANGE || item->type == RIT_BKREF || item->type == RIT_SUBEXP ) )
+			{
+				int min = 1, max = 1;
+				if( *s == '{' )
+				{
+					int ctr;
+					s++;
+					if( !isdigit( *s ) )
+						_RXE( RXEUNEXP );
+					min = 0;
+					ctr = 8; /* at most 8 digits are read per number */
+					while( isdigit( *s ) && ctr > 0 )
+					{
+						min = min * 10 + *s++ - '0';
+						ctr--;
+					}
+					if( isdigit( *s ) && ctr == 0 )
+						_RXE( RXELIMIT );
+					if( *s == ',' )
+					{
+						if( !isdigit(s[1]) )
+							_RXE( RXEUNEXP );
+						s++;
+						max = 0;
+						ctr = 8;
+						while( isdigit( *s ) && ctr > 0 )
+						{
+							max = max * 10 + *s++ - '0';
+							ctr--;
+						}
+						if( isdigit( *s ) && ctr == 0 )
+							_RXE( RXELIMIT );
+						if( min > max )
+							_RXE( RXERANGE );
+					}
+					else
+						max = min;
+					if( *s != '}' )
+						_RXE( RXEUNEXP );
+				}
+				else if( *s == '?' ){ min = 0; max = 1; }
+				else if( *s == '*' ){ min = 0; max = INT_MAX - 1; }
+				else if( *s == '+' ){ min = 1; max = INT_MAX - 1; }
+				item->min = min;
+				item->max = max;
+			}
+			else
+				_RXE( RXEUNEXP );
+			s++;
+			break;
+		case '}':
+			_RXE( RXEUNEXP );
+		case '|':
+			if( !citem )
+				_RXE( RXEUNEXP );
+			_RX_ALLOC_NODE( RIT_EITHER );
+			/* counter 0 selects the first alternative, 1 the second; lazy so
+			   that the first one is tried first */
+			item->min = 0;
+			item->max = 1;
+			item->flags |= RIF_LAZY;
+			s++;
+			break;
+		case '^':
+			_RX_ALLOC_NODE( RIT_SPCBEG );
+			s++;
+			break;
+		case '$':
+			_RX_ALLOC_NODE( RIT_SPCEND );
+			s++;
+			break;
+		case '\\':
+			if( s[1] )
+			{
+				s++;
+				if( *s == '.' )
+				{
+					_RX_ALLOC_NODE( RIT_MATCH );
+					item->a = *s++;
+					break;
+				}
+				else if( isdigit( *s ) )
+				{
+					int dig = *s++ - '0';
+					if( dig == 0 || dig >= RX_MAX_CAPTURES || !cel[ dig ] )
+						_RXE( RXENOREF );
+					_RX_ALLOC_NODE( RIT_BKREF );
+					item->a = (RX_Char) dig;
+					break;
+				}
+				else if( *s == 'd' || *s == 'D' )
+				{
+					_RX_ALLOC_NODE( RIT_RANGE );
+					item->range = RX_ALLOC_N( RX_Char, 2 );
+					item->count = 1;
+					item->range[0] = '0';
+					item->range[1] = '9';
+					if( *s == 'D' )
+						item->flags |= RIF_INVERT;
+					s++;
+					break;
+				}
+				else if( *s == 'h' || *s == 'H' )
+				{
+					_RX_ALLOC_NODE( RIT_RANGE );
+					item->range = RX_ALLOC_N( RX_Char, 2 * 2 );
+					item->count = 2;
+					item->range[0] = item->range[1] = '\t';
+					item->range[2] = item->range[3] = ' ';
+					if( *s == 'H' )
+						item->flags |= RIF_INVERT;
+					s++;
+					break;
+				}
+				else if( *s == 'v' || *s == 'V' )
+				{
+					_RX_ALLOC_NODE( RIT_RANGE );
+					item->range = RX_ALLOC_N( RX_Char, 2 );
+					item->count = 1;
+					item->range[0] = 0x0A;
+					item->range[1] = 0x0D;
+					if( *s == 'V' )
+						item->flags |= RIF_INVERT;
+					s++;
+					break;
+				}
+				else if( *s == 's' || *s == 'S' )
+				{
+					_RX_ALLOC_NODE( RIT_RANGE );
+					item->range = RX_ALLOC_N( RX_Char, 2 * 2 );
+					item->count = 2;
+					item->range[0] = 0x09;
+					item->range[1] = 0x0D;
+					item->range[2] = item->range[3] = ' ';
+					if( *s == 'S' )
+						item->flags |= RIF_INVERT;
+					s++;
+					break;
+				}
+				else if( *s == 'w' || *s == 'W' )
+				{
+					_RX_ALLOC_NODE( RIT_RANGE );
+					item->range = RX_ALLOC_N( RX_Char, 2 * 4 );
+					item->count = 4;
+					item->range[0] = 'a'; item->range[1] = 'z';
+					item->range[2] = 'A'; item->range[3] = 'Z';
+					item->range[4] = '0'; item->range[5] = '9';
+					item->range[6] = item->range[7] = '_';
+					if( *s == 'W' )
+						item->flags |= RIF_INVERT;
+					s++;
+					break;
+				}
+				/* TODO: more character classes */
+			}
+			else
+				_RXE( RXEPART );
+			/* any other escaped character falls through and is compiled
+			   by the default case below */
+		default:
+			if( *s == '.' )
+			{
+				_RX_ALLOC_NODE( RIT_RANGE );
+				if( !( R->flags & RCF_DOTALL ) )
+				{
+					item->range = RX_ALLOC_N( RX_Char, 2 * 2 );
+					item->range[0] = item->range[1] = '\n';
+					item->range[2] = item->range[3] = '\r';
+					item->count = 2;
+				}
+				item->flags |= RIF_INVERT;
+			}
+			else
+			{
+				_RX_ALLOC_NODE( RIT_MATCH );
+				item->a = *s;
+				if( R->flags & RCF_CASELESS && RX_IS_ALPHA( item->a ) )
+					item->a = RX_EQUALIZE( item->a );
+			}
+			s++;
+			break;
+		}
+		citem = item;
+	}
+	if( !item )
+		_RXE( RXEEMPTY );
+	if( item->type == RIT_EITHER )
+		_RXE( RXEPART );
+	*pstr = s;
+	/* rewind to the head of the sequence before building the alternation tree */
+	while( item->prev )
+		item = item->prev;
+	regex_level( &item );
+	*out = item;
+	return RXSUCCESS;
+fail:
+	regex_free_item( R, item );
+	return (int)( ( error & 0xf ) | ( ( s - R->string ) << 4 ) );
+}
+
+/*
+	#### srx_CreateExt ####
+*/
+srx_Context* srx_CreateExt( const RX_Char* str, size_t strsize, const RX_Char* mods, int* errnpos, srx_MemFunc memfn, void* memctx ) /* compile str (strsize characters) with the modifier letters in mods; returns the new context, or NULL on error with errnpos[0] = error code and errnpos[1] = offset when errnpos is given */
+{
+	int flags = 0, err, cel[ RX_MAX_CAPTURES ];
+	srx_Context* R = NULL;
+	if( mods )
+	{
+		const RX_Char* modbegin = mods;
+		while( *mods )
+		{
+			switch( *mods )
+			{
+			case 'm': flags |= RCF_MULTILINE; break;
+			case 'i': flags |= RCF_CASELESS; break;
+			case 's': flags |= RCF_DOTALL; break;
+			default:
+				err = ( RXEINMOD & 0xf ) | ( (int)( mods - modbegin ) << 4 ); /* same packing as regex_real_compile: code in the low 4 bits, offset above */
+				goto fail;
+			}
+			mods++;
+		}
+	}
+	
+	if( !memfn )
+		memfn = srx_DefaultMemFunc;
+	
+	R = (srx_Context*) memfn( memctx, NULL, sizeof(srx_Context) );
+	memset( R, 0, sizeof(*R) );
+	memset( cel, 0, sizeof(cel) );
+	R->memfn = memfn;
+	R->memctx = memctx;
+	R->string = str;
+	R->stringend = str + strsize;
+	R->flags = flags;
+	R->numcaps = 1; /* slot 0 is reserved for the whole match */
+	
+	err = regex_real_compile( R, cel, &str, str + strsize, 0, &R->root );
+	
+	if( err )
+	{
+		memfn( memctx, R, 0 );
+		R = NULL;
+	}
+	else
+	{
+		regex_item* item = RX_ALLOC( regex_item ); /* wrap the compiled sequence in a root sub-expression that doubles as capture 0 */
+		memset( item, 0, sizeof(*item) );
+		item->type = RIT_SUBEXP;
+		item->min = 1;
+		item->max = 1;
+		item->pos = item->ch = R->root;
+		R->caps[ 0 ] = R->root = item;
+	}
+fail:
+	if( errnpos )
+	{
+		unsigned uerr = (unsigned) err;
+		errnpos[0] = (int)( uerr ? ( uerr & 0xf ) | 0xfffffff0 : 0 ); /* sign-extend the 4-bit code back to a negative RXE* value */
+		errnpos[1] = (int)( ( uerr & 0xfffffff0 ) >> 4 );
+	}
+	RXLOGINFO( if( R ) srx_DumpToStdout(R) );
+	return R;
+}
+
+/*
+	#### srx_Destroy ####
+*/
+int srx_Destroy( srx_Context* R )
+{
+	/* frees the node tree and the context; returns 1 if R was non-NULL, else 0 */
+	srx_MemFunc memfn;
+	void* memctx;
+	if( !R )
+		return 0;
+	/* copy the allocator out first: it is needed to release R itself */
+	memfn = R->memfn;
+	memctx = R->memctx;
+	if( R->root )
+		regex_free_item( R, R->root );
+	memfn( memctx, R, 0 );
+	return 1;
+}
+
+
+static void regex_dump_list( regex_item* items, int lev );
+static void regex_dump_item( regex_item* item, int lev ) /* print one node, indented by lev, followed by its child sequences */
+{
+	const char* types[] = /* indexed by the RIT_* value */
+	{
+		"-", "MATCH (1)", "RANGE (2)", "SPCBEG (3)", "SPCEND (4)", "BKREF (5)", "-", "-", "-", "-",
+		"-", "EITHER (11)", "SUBEXP (12)", "-"
+	};
+	
+	int l = lev;
+	while( l --> 0 )
+		printf( "- " );
+	printf( "%s", types[ item->type ] );
+	if( item->flags & RIF_INVERT ) printf( " INV" );
+	if( item->flags & RIF_LAZY ) printf( " LAZY" );
+	switch( item->type )
+	{
+	case RIT_MATCH: printf( " char %d", (int) item->a ); break;
+	case RIT_RANGE:
+		for( l = 0; l < item->count; ++l )
+		{
+			if( l > 0 )
+				printf( "," );
+			printf( " %d - %d", (int) item->range[l*2], (int) item->range[l*2+1] );
+		}
+		break;
+	case RIT_BKREF: printf( " #%d", (int) item->a ); break;
+	}
+	printf( " (%d to %d) (0x%p => 0x%p)\n", item->min, item->max, item->matchbeg, item->matchend );
+	
+	if( item->ch )
+	{
+		regex_dump_list( item->ch, lev + 1 );
+		if( item->ch2 ) /* second alternative, printed after a "--|" separator */
+		{
+			int l2 = lev;
+			while( l2 --> 0 )
+				printf( "- " );
+			printf( "--|\n" );
+			regex_dump_list( item->ch2, lev + 1 );
+		}
+	}
+}
+static void regex_dump_list( regex_item* items, int lev )
+{
+	/* print every node of the sequence at indentation level lev */
+	regex_item* node;
+	for( node = items; node; node = node->next )
+		regex_dump_item( node, lev );
+}
+
+/*
+	#### srx_DumpToStdout ####
+*/
+void srx_DumpToStdout( srx_Context* R )
+{
+	/* print the compiled node tree, starting at the root with no indentation */
+	regex_item* root = R->root;
+	regex_dump_list( root, 0 );
+}
+
+/*
+	#### srx_Match ####
+*/
+/*
+	Searches the size characters at str for the first position, at or after
+	offset, where the compiled pattern matches. Returns 1 when a match was
+	found (the captures can then be read with srx_GetCaptured*), 0 otherwise
+	or when offset lies beyond size.
+*/
+int srx_MatchExt( srx_Context* R, const RX_Char* str, size_t size, size_t offset )
+{
+	int ret;
+	const RX_Char* strend = str + size;
+	match_ctx ctx;
+	{
+		ctx.string = str;
+		ctx.stringend = strend;
+		ctx.item = R->root;
+		ctx.R = R;
+	}
+	/* captured offsets are reported relative to this string */
+	R->string = str;
+	R->stringend = strend;
+	if( offset > size )
+		return 0;
+	str += offset;
+	/* The end position itself is tried as well: a pattern that can match the
+	   empty string ("$", "a*", "^$" on empty input) may only match there. */
+	while( str <= strend )
+	{
+		ret = regex_test_start( str, &ctx );
+		if( ret < 0 )
+			return 0;
+		if( ret > 0 )
+			return 1;
+		str++;
+	}
+	return 0;
+}
+
+/*
+	#### srx_GetCaptureCount ####
+*/
+int srx_GetCaptureCount( srx_Context* R )
+{
+	/* number of capture slots in use, slot 0 (the whole match) included */
+	int count = R->numcaps;
+	return count;
+}
+
+/*
+	#### srx_GetCaptured ####
+*/
+int srx_GetCaptured( srx_Context* R, int which, size_t* pbeg, size_t* pend )
+{
+	/* like srx_GetCapturedPtrs, but reports offsets from the start of the
+	   string last passed to srx_MatchExt; either output may be NULL */
+	const RX_Char *capbeg, *capend;
+	if( !srx_GetCapturedPtrs( R, which, &capbeg, &capend ) )
+		return 0;
+	if( pbeg )
+		*pbeg = (size_t)( capbeg - R->string );
+	if( pend )
+		*pend = (size_t)( capend - R->string );
+	return 1;
+}
+
+/*
+	#### srx_GetCapturedPtrs ####
+*/
+int srx_GetCapturedPtrs( srx_Context* R, int which, const RX_Char** pbeg, const RX_Char** pend )
+{
+	/* stores the begin/end pointers of capture "which"; returns 0 when the
+	   index is out of range or the slot is empty, 1 otherwise */
+	regex_item* cap;
+	if( which < 0 || which >= R->numcaps )
+		return 0;
+	cap = R->caps[ which ];
+	if( cap == NULL )
+		return 0;
+	if( pbeg )
+		*pbeg = cap->matchbeg;
+	if( pend )
+		*pend = cap->matchend;
+	return 1;
+}
+
+/*
+	#### srx_ReplaceExt ####
+*/
+/*
+	Builds a copy of the strsize characters at str in which every match of the
+	compiled pattern is replaced by rep (repsize characters). Inside rep,
+	"\N" or "$N" with a single digit N inserts the text of capture N, and a
+	doubled "\\" or "$$" inserts that character once.
+	Returns a NUL-terminated buffer allocated with the context's allocator;
+	release it with srx_FreeReplaced. If outsize is given it receives the
+	length of the result without the terminator.
+*/
+RX_Char* srx_ReplaceExt( srx_Context* R, const RX_Char* str, size_t strsize, const RX_Char* rep, size_t repsize, size_t* outsize )
+{
+	RX_Char* out = "";
+	const RX_Char *from = str, *fromend = str + strsize, *repend = rep + repsize;
+	size_t size = 0, mem = 0;
+	
+	/* grows the output buffer so that at least szext more bytes fit */
+#define SR_CHKSZ( szext ) \
+	if( (ptrdiff_t)( mem - size ) < (ptrdiff_t)(szext) ) \
+	{ \
+		size_t nsz = MAX( mem * 2, size + (size_t)(szext) ); \
+		RX_Char* nmem = RX_ALLOC_N( RX_Char, nsz + 1 ); \
+		if( mem ) \
+		{ \
+			memcpy( nmem, out, size + 1 ); /* copy with \0 */ \
+			RX_FREE( out ); \
+		} \
+		out = nmem; \
+		mem = nsz; \
+	}
+	/* appends the bytes between from and to to the output */
+#define SR_ADDBUF( from, to ) \
+	SR_CHKSZ( to - from ) \
+	memcpy( out + size, from, (size_t)( to - from ) ); \
+	size += (size_t)( to - from );
+	
+	while( from < fromend )
+	{
+		const RX_Char* ofp = NULL, *ep = NULL, *rp;
+		if( !srx_MatchExt( R, from, (size_t)( fromend - from ), 0 ) )
+			break;
+		srx_GetCapturedPtrs( R, 0, &ofp, &ep );
+		/* text between the previous match and this one is kept as is */
+		SR_ADDBUF( from, ofp );
+		
+		rp = rep;
+		while( rp < repend )
+		{
+			RX_Char rc = *rp;
+			if( ( rc == '\\' || rc == '$' ) && rp + 1 < repend )
+			{
+				if( isdigit( rp[1] ) )
+				{
+					int dig = rp[1] - '0';
+					const RX_Char *brp, *erp;
+					if( srx_GetCapturedPtrs( R, dig, &brp, &erp ) )
+					{
+						SR_ADDBUF( brp, erp );
+					}
+					rp += 2;
+					continue;
+				}
+				else if( rp[1] == rc )
+				{
+					/* doubled escape character: emit it once */
+					rp++;
+				}
+			}
+			SR_ADDBUF( rp, rp + 1 );
+			rp++;
+		}
+		
+		if( ofp == ep )
+		{
+			/* Empty match: step over one input character so the search makes
+			   progress, and copy that character - skipping it silently would
+			   drop it from the result, and restarting at the same position
+			   would replace the same empty match twice. */
+			if( ep >= fromend )
+			{
+				from = fromend;
+				break;
+			}
+			SR_ADDBUF( ep, ep + 1 );
+			from = ep + 1;
+		}
+		else
+			from = ep;
+	}
+	
+	/* whatever follows the last match */
+	SR_ADDBUF( from, fromend );
+	if( outsize )
+		*outsize = size;
+	{
+		char nul[1] = {0};
+		SR_ADDBUF( nul, &nul[1] );
+	}
+	return out;
+}
+
+/*
+	#### srx_FreeReplaced ####
+*/
+void srx_FreeReplaced( srx_Context* R, RX_Char* repstr )
+{
+	/* hands a buffer returned by srx_ReplaceExt back to the context's allocator */
+	R->memfn( R->memctx, repstr, 0 );
+}
+

A vendor/sgregex/regex.h

@@ -0,0 +1,70 @@ 
+
+
+#pragma once
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stddef.h>
+
+
+#define RXSUCCESS 0
+#define RXEINMOD  -1 /* invalid modifier */
+#define RXEPART   -2 /* partial (sub-)expression */
+#define RXEUNEXP  -3 /* unexpected character */
+#define RXERANGE  -4 /* invalid range (min > max) */
+#define RXELIMIT  -5 /* too many digits */
+#define RXEEMPTY  -6 /* expression is effectively empty */
+#define RXENOREF  -7 /* the specified backreference cannot be used here */
+
+#define RX_ALLMODS "mis"
+
+#ifndef RX_STRLENGTHFUNC
+#define RX_STRLENGTHFUNC( str ) strlen( str )
+#endif
+
+
+typedef void* (*srx_MemFunc)
+(
+	void* /* userdata */,
+	void* /* ptr */,
+	size_t /* size */
+);
+
+#ifdef RX_NEED_DEFAULT_MEMFUNC
+static void* srx_DefaultMemFunc( void* userdata, void* ptr, size_t size )
+{
+	/* realloc-style allocator on top of the C heap: a size of 0 frees ptr and
+	   yields NULL, any other size (re)allocates; userdata is unused */
+	(void) userdata;
+	if( size == 0 )
+	{
+		free( ptr );
+		return NULL;
+	}
+	return realloc( ptr, size );
+}
+#endif
+
+typedef char RX_Char;
+
+typedef struct _srx_Context srx_Context;
+
+
+srx_Context* srx_CreateExt( const RX_Char* str, size_t strsize, const RX_Char* mods, int* errnpos, srx_MemFunc memfn, void* memctx );
+#define srx_Create( str, mods ) srx_CreateExt( str, RX_STRLENGTHFUNC(str), mods, NULL, srx_DefaultMemFunc, NULL )
+int srx_Destroy( srx_Context* R );
+void srx_DumpToStdout( srx_Context* R );
+
+int srx_MatchExt( srx_Context* R, const RX_Char* str, size_t size, size_t offset );
+#define srx_Match( R, str, off ) srx_MatchExt( R, str, RX_STRLENGTHFUNC(str), off )
+int srx_GetCaptureCount( srx_Context* R );
+int srx_GetCaptured( srx_Context* R, int which, size_t* pbeg, size_t* pend );
+int srx_GetCapturedPtrs( srx_Context* R, int which, const RX_Char** pbeg, const RX_Char** pend );
+
+RX_Char* srx_ReplaceExt( srx_Context* R, const RX_Char* str, size_t strsize, const RX_Char* rep, size_t repsize, size_t* outsize );
+#define srx_Replace( R, str, rep ) srx_ReplaceExt( R, str, RX_STRLENGTHFUNC(str), rep, RX_STRLENGTHFUNC(rep), NULL )
+void srx_FreeReplaced( srx_Context* R, RX_Char* repstr );
+
+
+#ifdef __cplusplus
+}
+#endif
+

D vendor/slre.nim

@@ -1,125 +0,0 @@ 
-#
-#  Copyright (c) 2004-2005 Sergey Lyubka <valenok@gmail.com>
-#  All rights reserved
-# 
-#  "THE BEER-WARE LICENSE" (Revision 42):
-#  Sergey Lyubka wrote this file.  As long as you retain this notice you
-#  can do whatever you want with this stuff. If we meet some day, and you think
-#  this stuff is worth it, you can buy me a beer in return.
-# 
-#
-#  This is a regular expression library that implements a subset of Perl RE.
-#  Please refer to http://slre.sourceforge.net for detailed description.
-# 
-#  Usage example (parsing HTTP request):
-# 
-#  struct slre	slre;
-#  struct cap	captures[4 + 1];  // Number of bracket pairs + 1
-#  ...
-# 
-#  slre_compile(&slre,"^(GET|POST) (\S+) HTTP/(\S+?)\r\n");
-# 
-#  if (slre_match(&slre, buf, len, captures)) {
-# 	printf("Request line length: %d\n", captures[0].len);
-# 	printf("Method: %.*s\n", captures[1].len, captures[1].ptr);
-# 	printf("URI: %.*s\n", captures[2].len, captures[2].ptr);
-#  }
-# 
-#  Supported syntax:
-# 	^		Match beginning of a buffer
-# 	$		Match end of a buffer
-# 	()		Grouping and substring capturing
-# 	[...]		Match any character from set
-# 	[^...]		Match any character but ones from set
-# 	\s		Match whitespace
-# 	\S		Match non-whitespace
-# 	\d		Match decimal digit
-# 	\r		Match carriage return
-# 	\n		Match newline
-# 	+		Match one or more times (greedy)
-# 	+?		Match one or more times (non-greedy)
-# 	*		Match zero or more times (greedy)
-# 	*?		Match zero or more times (non-greedy)
-# 	?		Match zero or once
-# 	\xDD		Match byte with hex value 0xDD
-# 	\meta		Match one of the meta character: ^$().[*+?\
-# 
-
-{.compile: "slre/libslre.c".}
-#
-#  Compiled regular expression
-# 
-type 
-  # Nim mirror of `struct slre` from slre/slre.h. The object is handed to the
-  # C code by pointer, so field order and sizes must stay identical to the
-  # C definition.
-  slre* = object 
-    code*: array[256, cuchar]   # Compiled opcodes and their operand bytes
-    data*: array[256, cuchar]   # Literal characters referenced by the opcodes
-    code_size*: cint            # Number of bytes used in `code`
-    data_size*: cint            # Number of bytes used in `data`
-    num_caps*: cint         # Number of bracket pairs	
-    anchored*: cint         # Must match from string start	
-    err_str*: cstring       # Error string			
-  
-#
-#  Captured substring
-# 
-type 
-  # Nim mirror of `struct cap` from slre/slre.h (the C field is named `ptr`).
-  # `value` points into the matched buffer and is not terminated at `len`,
-  # so callers must cut the substring themselves.
-  cap* = object 
-    value*: cstring           # Pointer to the substring	
-    len*: cint              # Substring length		
-
-#
-#  Compile regular expression. If success, 1 is returned.
-#  If error, 0 is returned and slre.err_str points to the error message. 
-# 
-#  Binding to slre_compile() in slre/libslre.c; `a2` receives the compiled
-#  program and, on failure, the error text in `err_str`.
-proc slre_compile(a2: ptr slre; re: cstring): cint {.importc.}
-#
-#  Return 1 if match, 0 if no match. 
-#  If `captured_substrings' array is not NULL, then it is filled with the
-#  values of captured substrings. captured_substrings[0] element is always
-#  a full matched substring. The round bracket captures start from
-#  captured_substrings[1].
-#  It is assumed that the size of captured_substrings array is enough to
-#  hold all captures. The caller function must make sure it is! So, the
-#  array_size = number_of_round_bracket_pairs + 1
-# 
-#  Binding to slre_match() in slre/libslre.c.
-#  NOTE(review): the C prototype takes a plain `struct cap *`, while an
-#  `openarray` parameter is normally lowered to a (pointer, length) pair --
-#  confirm that the extra trailing argument is harmless on every target ABI.
-proc slre_match(a2: ptr slre; buf: cstring; buf_len: cint; 
-                 captured_substrings: openarray[cap]): cint {.importc.}
-
-# High level API
-from strutils import contains, replace, parseInt
-from sequtils import delete
-
-proc match*(s: string, re: string): seq[string] =
-  ## Matches `s` against the regular expression `re`.
-  ##
-  ## Returns the captured substrings (index 0 is the whole match, the
-  ## bracket groups follow; captures whose pointer was never set are
-  ## skipped), or an empty sequence when `s` does not match.
-  ##
-  ## Raises `ValueError` when `re` does not compile, or when it contains
-  ## more bracket groups than the fixed capture buffer can hold.
-  # The compiled program lives on the stack: nothing to free on any path
-  # (the previous alloc0'd object was never released).
-  var rawre: slre
-  if slre_compile(addr rawre, re) != 1:
-    raise newException(ValueError, $(rawre.err_str))
-  var matches: array[10, cap]
-  # slre_match writes one slot per bracket pair plus one for the whole match
-  # and does no bounds checking of its own.
-  if rawre.num_caps.int + 1 > matches.len:
-    raise newException(ValueError, "Too many capture groups in regular expression")
-  result = newSeq[string](0)
-  if slre_match(addr rawre, s.cstring, s.len.cint, matches) == 1:
-    for m in items(matches):
-      if m.value != nil:
-        # `value` points into `s` and runs to its end; cut it to `len` chars.
-        let tail = $(m.value)
-        result.add tail.substr(0, m.len - 1)
-
-proc gsub*(s_find: string, re: string, s_replace: string): string =
-  ## Replaces every occurrence of the text matched by `re` in `s_find`
-  ## with `s_replace` and returns the result; returns `s_find` unchanged
-  ## when there is no match or the match is empty.
-  ##
-  ## When `re` has bracket groups, each `$N` (N = 0..9) in `s_replace` is
-  ## substituted with capture N. Placeholders that refer to a capture that
-  ## does not exist are kept literally.
-  let matches = s_find.match(re)
-  if matches.len == 0 or matches[0].len == 0:
-    # No match, or an empty one: there is no text to substitute.
-    return s_find
-  # Expand the placeholders in the replacement *before* substituting it, in a
-  # single left-to-right pass. This way `$N` sequences that occur in the
-  # subject or inside a capture are never re-expanded, every placeholder is
-  # handled (not just the first one), and the loop always terminates.
-  var expanded = newStringOfCap(s_replace.len)
-  var i = 0
-  while i < s_replace.len:
-    let c = s_replace[i]
-    if matches.len > 1 and c == '$' and i + 1 < s_replace.len and
-        s_replace[i + 1] in {'0'..'9'}:
-      let ci = ord(s_replace[i + 1]) - ord('0')
-      if ci < matches.len:
-        expanded.add matches[ci]
-        inc i, 2
-        continue
-    expanded.add c
-    inc i
-  result = s_find.replace(matches[0], expanded)

D vendor/slre/libslre.c

@@ -1,667 +0,0 @@ 
-/*
- * Copyright (c) 2004-2005 Sergey Lyubka <valenok@gmail.com>
- * All rights reserved
- *
- * "THE BEER-WARE LICENSE" (Revision 42):
- * Sergey Lyubka wrote this file.  As long as you retain this notice you
- * can do whatever you want with this stuff. If we meet some day, and you think
- * this stuff is worth it, you can buy me a beer in return.
- */
-
-#include <stdio.h>
-#include <assert.h>
-#include <ctype.h>
-#include <stdlib.h>
-#include <string.h>
-#include <errno.h>
-
-#include "slre.h"
-
-/*
- * Opcode numbers. The order must stay in step with the rows of opcodes[]
- * below, because that table is indexed by opcode.
- */
-enum {END, BRANCH, ANY, EXACT, ANYOF, ANYBUT, OPEN, CLOSE, BOL, EOL,
-	STAR, PLUS, STARQ, PLUSQ, QUEST, SPACE, NONSPACE, DIGIT};
-
-/* Per-opcode description table, used when dumping a compiled program. */
-static struct {
-	const char	*name;	/* Printable opcode name */
-	int		narg;	/* Operand byte count (not read anywhere in this file) */
-	const char	*flags;	/* Operand kinds for slre_dump(): i = literal byte, o = relative code offset, D = character set, d = exact string */
-} opcodes[] = {
-	{"END",		0, ""},		/* End of code block or program	*/
-	{"BRANCH",	2, "oo"},	/* Alternative operator, "|"	*/
-	{"ANY",		0, ""},		/* Match any character, "."	*/
-	{"EXACT",	2, "d"},	/* Match exact string		*/
-	{"ANYOF",	2, "D"},	/* Match any from set, "[]"	*/
-	{"ANYBUT",	2, "D"},	/* Match any but from set, "[^]"*/
-	{"OPEN ",	1, "i"},	/* Capture start, "("		*/
-	{"CLOSE",	1, "i"},	/* Capture end, ")"		*/
-	{"BOL",		0, ""},		/* Beginning of string, "^"	*/
-	{"EOL",		0, ""},		/* End of string, "$"		*/
-	{"STAR",	1, "o"},	/* Match zero or more times "*"	*/
-	{"PLUS",	1, "o"},	/* Match one or more times, "+"	*/
-	{"STARQ",	1, "o"},	/* Non-greedy STAR,  "*?"	*/
-	{"PLUSQ",	1, "o"},	/* Non-greedy PLUS, "+?"	*/
-	{"QUEST",	1, "o"},	/* Match zero or one time, "?"	*/
-	{"SPACE",	0, ""},		/* Match whitespace, "\s"	*/
-	{"NONSPACE",	0, ""},		/* Match non-space, "\S"	*/
-	{"DIGIT",	0, ""}		/* Match digit, "\d"		*/
-};
-
-/*
- * Commands and operands are all unsigned char (1 byte long). All code offsets
- * are relative to current address, and positive (always point forward). Data
- * offsets are absolute. Commands with operands:
- *
- * BRANCH offset1 offset2
- *	Try to match the code block that follows the BRANCH instruction
- *	(code block ends with END). If no match, try to match code block that
- *	starts at offset1. If either of these match, jump to offset2.
- *
- * EXACT data_offset data_length
- *	Try to match exact string. String is recorded in data section from
- *	data_offset, and has length data_length.
- *
- * OPEN capture_number
- * CLOSE capture_number
- *	If the user have passed 'struct cap' array for captures, OPEN
- *	records the beginning of the matched substring (cap->ptr), CLOSE
- *	sets the length (cap->len) for respective capture_number.
- *
- * STAR code_offset
- * PLUS code_offset
- * QUEST code_offset
- *	*, +, ?, respectively. Try to gobble as much as possible from the
- *	matched buffer, until code block that follows these instructions
- *	matches. When the longest possible string is matched,
- *	jump to code_offset
- *
- * STARQ, PLUSQ are non-greedy versions of STAR and PLUS.
- */
-
-/* Characters that terminate a literal run in exact(); every other character
- * is copied verbatim into the data section. */
-static const char *meta_chars = "|.^$*+?()[\\";
-
-/*
- * Write the `len` bytes of character set `p` to `fp`, comma separated.
- * A zero byte introduces a two-byte entry: 0,0 is printed as \x00 and
- * 0,<opcode> is printed as that opcode's name. Any other byte is printed
- * as itself when printable and as \xNN otherwise.
- */
-static void
-print_character_set(FILE *fp, const unsigned char *p, int len)
-{
-	int	pos = 0;
-
-	while (pos < len) {
-		if (pos != 0)
-			(void) fputc(',', fp);
-
-		if (p[pos] != 0) {
-			if (isprint(p[pos]))
-				(void) fputc(p[pos], fp);
-			else
-				(void) fprintf(fp,"\\x%02x", p[pos]);
-		} else {
-			pos++;	/* step onto the byte that follows the 0 marker */
-			if (p[pos] != 0)
-				(void) fprintf(fp, "%s", opcodes[p[pos]].name);
-			else
-				(void) fprintf(fp, "\\x%02x", p[pos]);
-		}
-		pos++;
-	}
-}
-
-/*
- * Print a human-readable listing of the compiled program in `r` to `fp`,
- * one instruction per line: its address, its name and its operands, decoded
- * according to the opcode's `flags` string in opcodes[].
- */
-void
-slre_dump(const struct slre *r, FILE *fp)
-{
-	int	i, j, ch, op, pc;
-
-	for (pc = 0; pc < r->code_size; pc++) {
-
-		op = r->code[pc];
-		(void) fprintf(fp, "%3d %s ", pc, opcodes[op].name);
-
-		/* Each case advances pc over the operand bytes it consumed. */
-		for (i = 0; opcodes[op].flags[i] != '\0'; i++)
-			switch (opcodes[op].flags[i]) {
-			case 'i':
-				/* Literal operand (capture number) */
-				(void) fprintf(fp, "%d ", r->code[pc + 1]);
-				pc++;
-				break;
-			case 'o':
-				/*
-				 * Relative offset, printed as an absolute
-				 * address: pc has already moved past i operand
-				 * bytes, so "pc - i" is the opcode's address.
-				 */
-				(void) fprintf(fp, "%d ",
-				    pc + r->code[pc + 1] - i);
-				pc++;
-				break;
-			case 'D':
-				/* Character set: data offset, then length */
-				print_character_set(fp, r->data +
-				    r->code[pc + 1], r->code[pc + 2]);
-				pc += 2;
-				break;
-			case 'd':
-				/* Exact string: data offset, then length */
-				(void) fputc('"', fp);
-				for (j = 0; j < r->code[pc + 2]; j++) {
-					ch = r->data[r->code[pc + 1] + j];
-					if (isprint(ch))
-						(void) fputc(ch, fp);
-					else
-						(void) fprintf(fp,"\\x%02x",ch);
-				}
-				(void) fputc('"', fp);
-				pc += 2;
-				break;
-			}
-
-		(void) fputc('\n', fp);
-	}
-}
-
-/*
- * Store at code[pc] the distance from `offset` to the current end of the
- * code. The distance has to fit in one byte; otherwise the error string is
- * set and code[pc] is left untouched.
- */
-static void
-set_jump_offset(struct slre *r, int pc, int offset)
-{
-	int	distance;
-
-	assert(offset < r->code_size);
-
-	distance = r->code_size - offset;
-	if (distance > 0xff) {
-		r->err_str = "Jump offset is too big";
-		return;
-	}
-
-	r->code[pc] = (unsigned char) distance;
-}
-
-/*
- * Append one byte to the code section. When the section is full the byte
- * is dropped and the error string is set instead.
- */
-static void
-emit(struct slre *r, int code)
-{
-	const int	capacity = (int) (sizeof(r->code) / sizeof(r->code[0]));
-
-	if (r->code_size < capacity) {
-		r->code[r->code_size] = (unsigned char) code;
-		r->code_size++;
-	} else {
-		r->err_str = "RE is too long (code overflow)";
-	}
-}
-
-/*
- * Append one character to the data section. When the section is full the
- * character is dropped and the error string is set instead.
- */
-static void
-store_char_in_data(struct slre *r, int ch)
-{
-	const int	capacity = (int) sizeof(r->data);
-
-	if (r->data_size < capacity) {
-		r->data[r->data_size] = ch;
-		r->data_size++;
-	} else {
-		r->err_str = "RE is too long (data overflow)";
-	}
-}
-
-/*
- * Compile a run of literal characters: copy everything up to the next meta
- * character (or the end of the pattern) into the data section, then emit
- * EXACT followed by the run's data offset and length. On return *re points
- * at the character that stopped the run.
- */
-static void
-exact(struct slre *r, const char **re)
-{
-	const int	start = r->data_size;
-
-	for (; **re != '\0' && strchr(meta_chars, **re) == NULL; (*re)++)
-		store_char_in_data(r, **re);
-
-	emit(r, EXACT);
-	emit(r, start);
-	emit(r, r->data_size - start);
-}
-
-/*
- * Decode the character that follows a backslash and advance *re past it.
- *
- * Returns the character itself for \n, \r, \t, \0 and for any other
- * escaped character, or the opcode shifted left by 8 bits (low byte zero)
- * for the classes \S, \s and \d.
- *
- * A backslash that is the last character of the pattern is returned as a
- * literal backslash and *re is left on the terminating NUL; stepping over
- * the terminator would make the callers read past the end of the pattern.
- */
-static int
-get_escape_char(const char **re)
-{
-	int	res;
-
-	if (**re == '\0')
-		return ('\\');
-
-	switch (*(*re)++) {
-	case 'n':	res = '\n';		break;
-	case 'r':	res = '\r';		break;
-	case 't':	res = '\t';		break;
-	case '0':	res = 0;		break;
-	case 'S':	res = NONSPACE << 8;	break;
-	case 's':	res = SPACE << 8;	break;
-	case 'd':	res = DIGIT << 8;	break;
-	default:	res = (*re)[-1];	break;
-	}
-
-	return (res);
-}
-
-/*
- * Compile a bracket expression. On entry *re points just past the '[';
- * a leading '^' selects ANYBUT instead of ANYOF. The set members are
- * appended to the data section and, at the closing ']', the opcode is
- * emitted followed by the set's data offset and length. If the pattern ends
- * before a ']' is seen, the error string is set.
- */
-static void
-anyof(struct slre *r, const char **re)
-{
-	int	esc, old_data_size = r->data_size, op = ANYOF;
-
-	if (**re == '^') {
-		op = ANYBUT;
-		(*re)++;
-	}
-
-	while (**re != '\0')
-
-		switch (*(*re)++) {
-		case ']':
-			emit(r, op);
-			emit(r, old_data_size);
-			emit(r, r->data_size - old_data_size);
-			return;
-			/* NOTREACHED */
-			break;
-		case '\\':
-			/*
-			 * An escape whose low byte is zero (\0 or a class such
-			 * as \s) is stored as the pair 0, <high byte>.
-			 * NOTE(review): is_any_of()/is_any_but() compare raw
-			 * bytes only, so they do not appear to interpret such
-			 * pairs as classes -- confirm.
-			 */
-			esc = get_escape_char(re);
-			if ((esc & 0xff) == 0) {
-				store_char_in_data(r, 0);
-				store_char_in_data(r, esc >> 8);
-			} else {
-				store_char_in_data(r, esc);
-			}
-			break;
-		default:
-			store_char_in_data(r, (*re)[-1]);
-			break;
-		}
-
-	r->err_str = "No closing ']' bracket";
-}
-
-/*
- * Terminate the code emitted so far with END, then move everything from
- * `begin` onwards up by `shift` bytes so the caller can write a new
- * instruction at `begin`.
- *
- * If the shifted code would not fit in the code array, nothing is moved and
- * the error string is set, so compilation ends up failing instead of
- * writing past the end of r->code.
- */
-static void
-relocate(struct slre *r, int begin, int shift)
-{
-	emit(r, END);
-	if (r->code_size + shift > (int) sizeof(r->code)) {
-		r->err_str = "RE is too long (code overflow)";
-		return;
-	}
-	memmove(r->code + begin + shift, r->code + begin, r->code_size - begin);
-	r->code_size += shift;
-}
-
-/*
- * Wrap the instruction at `prev` in the quantifier `op`: the code from
- * `prev` onwards is moved up two bytes and `op` plus its jump offset are
- * written in the gap.
- */
-static void
-quantifier(struct slre *r, int prev, int op)
-{
-	/*
-	 * A quantifier binds to a single character only: when the previous
-	 * instruction is a multi-character EXACT, shorten it by one and emit
-	 * its last character as a separate EXACT, which becomes the target.
-	 */
-	if (r->code[prev] == EXACT && r->code[prev + 2] > 1) {
-		r->code[prev + 2]--;
-		emit(r, EXACT);
-		emit(r, r->code[prev + 1] + r->code[prev + 2]);
-		emit(r, 1);
-		prev = r->code_size - 3;
-	}
-	relocate(r, prev, 2);
-	r->code[prev] = op;
-	/* Offset from the quantifier to the code that follows its body */
-	set_jump_offset(r, prev + 1, prev);
-}
-
-/*
- * Emit an EXACT instruction that matches the single character `ch`,
- * storing the character at the current end of the data section.
- */
-static void
-exact_one_char(struct slre *r, int ch)
-{
-	const int	data_pos = r->data_size;
-
-	emit(r, EXACT);
-	emit(r, data_pos);
-	emit(r, 1);
-	store_char_in_data(r, ch);
-}
-
-/*
- * Close a pending alternative. When `fixup` is positive it is the address
- * of a BRANCH's second operand: the alternative is terminated with END and
- * the operand is patched with the distance from the BRANCH (at fixup - 2)
- * to the end of the code. A non-positive `fixup` means nothing is pending.
- */
-static void
-fixup_branch(struct slre *r, int fixup)
-{
-	if (fixup <= 0)
-		return;
-
-	emit(r, END);
-	set_jump_offset(r, fixup, fixup - 2);
-}
-
-/*
- * Recursive-descent compiler for one nesting level of the pattern.
- *
- * Reads characters from *re and appends instructions to `r` until the end
- * of the pattern or a ')' is reached; in both cases *re is left pointing at
- * that terminating character. Groups are compiled by a recursive call.
- * Errors are reported through r->err_str.
- *
- * Locals: `last_op` is the address of the most recent instruction a
- * quantifier can apply to, `branch_start` the address where the current
- * alternation begins, `fixup` the address of a pending BRANCH operand
- * (0 when none), and `level` the capture count on entry (0 only for the
- * outermost call).
- */
-static void
-compile(struct slre *r, const char **re)
-{
-	int	op, esc, branch_start, last_op, fixup, cap_no, level;
-
-	fixup = 0;
-	level = r->num_caps;
-	branch_start = last_op = r->code_size;
-
-	for (;;)
-		switch (*(*re)++) {
-		case '\0':
-			(*re)--;
-			return;
-			/* NOTREACHED */
-			break;
-		case '^':
-			emit(r, BOL);
-			break;
-		case '$':
-			emit(r, EOL);
-			break;
-		case '.':
-			last_op = r->code_size;
-			emit(r, ANY);
-			break;
-		case '[':
-			last_op = r->code_size;
-			anyof(r, re);
-			break;
-		case '\\':
-			last_op = r->code_size;
-			esc = get_escape_char(re);
-			/* Class escapes come back as (opcode << 8) */
-			if (esc & 0xff00) {
-				emit(r, esc >> 8);
-			} else {
-				exact_one_char(r, esc);
-			}
-			break;
-		case '(':
-			last_op = r->code_size;
-			cap_no = ++r->num_caps;
-			emit(r, OPEN);
-			emit(r, cap_no);
-
-			compile(r, re);
-			/*
-			 * The nested call stops on ')' or on the terminating
-			 * NUL. Only step over a real ')': advancing past the
-			 * NUL would make every caller up the chain read
-			 * beyond the end of the pattern.
-			 */
-			if (**re != ')') {
-				r->err_str = "No closing bracket";
-				return;
-			}
-			(*re)++;
-
-			emit(r, CLOSE);
-			emit(r, cap_no);
-			break;
-		case ')':
-			(*re)--;
-			fixup_branch(r, fixup);
-			if (level == 0) {
-				r->err_str = "Unbalanced brackets";
-				return;
-			}
-			return;
-			/* NOTREACHED */
-			break;
-		case '+':
-		case '*':
-			op = (*re)[-1] == '*' ? STAR: PLUS;
-			if (**re == '?') {
-				(*re)++;
-				op = op == STAR ? STARQ : PLUSQ;
-			}
-			quantifier(r, last_op, op);
-			break;
-		case '?':
-			quantifier(r, last_op, QUEST);
-			break;
-		case '|':
-			/*
-			 * Close the previous alternative, then insert a
-			 * 3-byte BRANCH in front of the whole alternation.
-			 */
-			fixup_branch(r, fixup);
-			relocate(r, branch_start, 3);
-			r->code[branch_start] = BRANCH;
-			set_jump_offset(r, branch_start + 1, branch_start);
-			fixup = branch_start + 2;
-			r->code[fixup] = 0xff;
-			break;
-		default:
-			(*re)--;
-			last_op = r->code_size;
-			exact(r, re);
-			break;
-		}
-}
-
-/*
- * Compile the regular expression `re` into `r`.
- *
- * The program is wrapped in OPEN 0 ... CLOSE 0 so that capture 0 receives
- * the whole match. Returns 1 on success; on failure returns 0 and
- * r->err_str points to the error message.
- */
-int
-slre_compile(struct slre *r, const char *re)
-{
-	r->err_str = NULL;
-	r->code_size = r->data_size = r->num_caps = r->anchored = 0;
-
-	if (*re == '^')
-		r->anchored++;
-
-	emit(r, OPEN);	/* This will capture what matches full RE */
-	emit(r, 0);
-
-	/*
-	 * Stop at the first error. compile() returns without consuming a
-	 * stray ')', so retrying after an error would loop forever on the
-	 * same character. The error test comes first so *re is not read
-	 * again once compilation has failed.
-	 */
-	while (r->err_str == NULL && *re != '\0')
-		compile(r, &re);
-
-	if (r->code[2] == BRANCH)
-		fixup_branch(r, 4);
-
-	emit(r, CLOSE);
-	emit(r, 0);
-	emit(r, END);
-
-	return (r->err_str == NULL ? 1 : 0);
-}
-
-static int match(const struct slre *, int,
-		const char *, int, int *, struct cap *);
-
-/*
- * Greedy repetition for the quantifier at `pc`.
- *
- * Repeats the quantified block (at pc + 2) as often as it matches and, after
- * each repetition, checks whether the code after the quantifier (at
- * pc + code[pc + 1]) matches from there. On return *ofs is the position
- * after the largest repetition count for which it did, or the entry
- * position when none was found.
- *
- * A repetition that consumes no input would repeat forever, so the loop
- * stops as soon as an iteration makes no progress.
- */
-static void
-loop_greedy(const struct slre *r, int pc, const char *s, int len, int *ofs)
-{
-	int	saved_offset, matched_offset;
-
-	saved_offset = matched_offset = *ofs;
-
-	while (match(r, pc + 2, s, len, ofs, NULL)) {
-		if (*ofs == saved_offset)
-			break;	/* zero-width repetition: no progress */
-		saved_offset = *ofs;
-		if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL))
-			matched_offset = saved_offset;
-		/* Undo whatever the look-ahead consumed */
-		*ofs = saved_offset;
-	}
-
-	*ofs = matched_offset;
-}
-
-/*
- * Non-greedy repetition for the quantifier at `pc`.
- *
- * Finds the smallest number of repetitions of the quantified block (at
- * pc + 2), starting with none, after which the code following the
- * quantifier (at pc + code[pc + 1]) matches, and leaves *ofs just after
- * those repetitions. When the following code never matches, *ofs is left
- * after the last repetition that succeeded.
- *
- * *ofs is rewound after every failed look-ahead, because a failed attempt
- * may already have consumed input. The loop also stops when a repetition
- * consumes nothing, since it could otherwise repeat forever.
- */
-static void
-loop_non_greedy(const struct slre *r, int pc, const char *s,int len, int *ofs)
-{
-	int	saved_offset = *ofs;
-
-	for (;;) {
-		/* Done as soon as the rest of the program matches here */
-		if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL))
-			break;
-		*ofs = saved_offset;
-
-		/* Otherwise take one more repetition, if there is one */
-		if (!match(r, pc + 2, s, len, ofs, NULL) ||
-		    *ofs == saved_offset)
-			break;
-		saved_offset = *ofs;
-	}
-
-	*ofs = saved_offset;
-}
-
-/*
- * Test whether the subject byte at s[*ofs] is one of the `len` bytes in the
- * set `p`. On a hit the offset is advanced by one and 1 is returned;
- * otherwise the offset is unchanged and 0 is returned. The caller must make
- * sure *ofs is inside the subject.
- *
- * The subject byte is read as unsigned char so that it compares equal to
- * the unsigned set bytes for values >= 0x80 even where plain char is signed.
- */
-static int
-is_any_of(const unsigned char *p, int len, const char *s, int *ofs)
-{
-	int	i, ch;
-
-	ch = (unsigned char) s[*ofs];
-
-	for (i = 0; i < len; i++)
-		if (p[i] == ch) {
-			(*ofs)++;
-			return (1);
-		}
-
-	return (0);
-}
-
-/*
- * Test whether the subject byte at s[*ofs] is NOT one of the `len` bytes in
- * the set `p`. When it is not in the set the offset is advanced by one and
- * 1 is returned; otherwise the offset is unchanged and 0 is returned. The
- * caller must make sure *ofs is inside the subject.
- *
- * The subject byte is read as unsigned char so that it compares equal to
- * the unsigned set bytes for values >= 0x80 even where plain char is signed.
- */
-static int
-is_any_but(const unsigned char *p, int len, const char *s, int *ofs)
-{
-	int	i, ch;
-
-	ch = (unsigned char) s[*ofs];
-
-	for (i = 0; i < len; i++)
-		if (p[i] == ch)
-			return (0);
-
-	(*ofs)++;
-	return (1);
-}
-
-/*
- * Bytecode interpreter: run the program in `r` starting at address `pc`
- * against the subject `s` of length `len`, beginning at offset *ofs.
- *
- * Executes instructions until one fails or an END is reached. Returns 1 on
- * success and 0 on failure; *ofs is advanced over the input consumed and
- * may also have moved when the result is 0. When `caps` is not NULL, OPEN
- * and CLOSE record capture positions in it.
- */
-static int
-match(const struct slre *r, int pc, const char *s, int len,
-		int *ofs, struct cap *caps)
-{
-	int	n, saved_offset, res = 1;
-
-	while (res && r->code[pc] != END) {
-
-		assert(pc < r->code_size);
-		assert(pc < (int) (sizeof(r->code) / sizeof(r->code[0])));
-
-		switch (r->code[pc]) {
-		case BRANCH:
-			/*
-			 * Try the first alternative (right after the BRANCH);
-			 * if it fails, rewind and try the second one, then
-			 * continue after the whole alternation.
-			 */
-			saved_offset = *ofs;
-			res = match(r, pc + 3, s, len, ofs, caps);
-			if (res == 0) {
-				*ofs = saved_offset;
-				res = match(r, pc + r->code[pc + 1],
-				    s, len, ofs, caps);
-			}
-			pc += r->code[pc + 2]; 
-			break;
-		case EXACT:
-			res = 0;
-			n = r->code[pc + 2];	/* String length */
-			if (n <= len - *ofs && !memcmp(s + *ofs, r->data +
-			    r->code[pc + 1], n)) {
-				(*ofs) += n;
-				res = 1;
-			}
-			pc += 3;
-			break;
-		case QUEST:
-			/* Optional block: a failed attempt is simply undone */
-			res = 1;
-			saved_offset = *ofs;
-			if (!match(r, pc + 2, s, len, ofs, caps))
-				*ofs = saved_offset;
-			pc += r->code[pc + 1];
-			break;
-		case STAR:
-			res = 1;
-			loop_greedy(r, pc, s, len, ofs);
-			pc += r->code[pc + 1];
-			break;
-		case STARQ:
-			res = 1;
-			loop_non_greedy(r, pc, s, len, ofs);
-			pc += r->code[pc + 1];
-			break;
-		case PLUS:
-			/* One mandatory repetition, then behave like STAR */
-			if ((res = match(r, pc + 2, s, len, ofs, caps)) == 0)
-				break;
-
-			loop_greedy(r, pc, s, len, ofs);
-			pc += r->code[pc + 1];
-			break;
-		case PLUSQ:
-			/* One mandatory repetition, then behave like STARQ */
-			if ((res = match(r, pc + 2, s, len, ofs, caps)) == 0)
-				break;
-
-			loop_non_greedy(r, pc, s, len, ofs);
-			pc += r->code[pc + 1];
-			break;
-		case SPACE:
-			res = 0;
-			if (*ofs < len && isspace(((unsigned char *)s)[*ofs])) {
-				(*ofs)++;
-				res = 1;
-			}
-			pc++;
-			break;
-		case NONSPACE:
-			res = 0;
-			if (*ofs <len && !isspace(((unsigned char *)s)[*ofs])) {
-				(*ofs)++;
-				res = 1;
-			}
-			pc++;
-			break;
-		case DIGIT:
-			res = 0;
-			if (*ofs < len && isdigit(((unsigned char *)s)[*ofs])) {
-				(*ofs)++;
-				res = 1;
-			}
-			pc++;
-			break;
-		case ANY:
-			res = 0;
-			if (*ofs < len) {
-				(*ofs)++;
-				res = 1;
-			}
-			pc++;
-			break;
-		case ANYOF:
-			res = 0;
-			if (*ofs < len)
-				res = is_any_of(r->data + r->code[pc + 1],
-					r->code[pc + 2], s, ofs);
-			pc += 3;
-			break;
-		case ANYBUT:
-			res = 0;
-			if (*ofs < len)
-				res = is_any_but(r->data + r->code[pc + 1],
-					r->code[pc + 2], s, ofs);
-			pc += 3;
-			break;
-		case BOL:
-			res = *ofs == 0 ? 1 : 0;
-			pc++;
-			break;
-		case EOL:
-			res = *ofs == len ? 1 : 0;
-			pc++;
-			break;
-		case OPEN:
-			/* Remember where capture number code[pc + 1] starts */
-			if (caps != NULL)
-				caps[r->code[pc + 1]].ptr = s + *ofs;
-			pc += 2;
-			break;
-		case CLOSE:
-			/* Capture length = current position - recorded start */
-			if (caps != NULL)
-				caps[r->code[pc + 1]].len = (s + *ofs) -
-				    caps[r->code[pc + 1]].ptr;
-			pc += 2;
-			break;
-		case END:
-			/* Not reached: the loop condition stops at END */
-			pc++;
-			break;
-		default:
-			printf("unknown cmd (%d) at %d\n", r->code[pc], pc);
-			assert(0);
-			break;
-		}
-	}
-
-	return (res);
-}
-
-/*
- * Match the compiled expression `r` against the first `len` bytes of `buf`.
- *
- * An anchored expression is tried at offset 0 only; otherwise every start
- * offset is tried in turn until one matches. Returns 1 on a match and 0
- * otherwise. When `caps` is not NULL it receives the captures; the array
- * must hold at least r->num_caps + 1 elements.
- */
-int
-slre_match(const struct slre *r, const char *buf, int len,
-		struct cap *caps)
-{
-	int	i, ofs = 0, res = 0;
-
-	if (r->anchored) {
-		res = match(r, 0, buf, len, &ofs, caps);
-	} else {
-		/*
-		 * The start offset `len` is tried as well, so expressions
-		 * that can match at the very end of the buffer ("$", or "a*"
-		 * on an empty buffer) are found. No instruction reads the
-		 * subject at an offset that is not below `len`.
-		 */
-		for (i = 0; i <= len && res == 0; i++) {
-			ofs = i;
-			res = match(r, 0, buf, len, &ofs, caps);
-		}
-	}
-
-	return (res);
-}
-
-#ifdef TEST
-/*
- * Test driver: slre_test 'regex' <file> [count]
- *
- * Compiles argv[1], dumps the program to stderr, matches it `count` times
- * (default 1) against the beginning of the file argv[2] and prints the
- * result and the non-empty captures. Always returns 0.
- */
-int main(int argc, char *argv[])
-{
-	struct slre	slre;
-	struct cap	caps[20];
-	/* static: a 1 MB automatic array can exhaust a small default stack */
-	static char	data[1 * 1024 * 1024];
-	FILE		*fp;
-	int		i, count, res, len;
-
-	if (argc < 3) {
-		printf("Usage: %s 'slre' <file> [count]\n", argv[0]);
-	} else if ((fp = fopen(argv[2], "rb")) == NULL) {
-		printf("Error: cannot open %s:%s\n", argv[2], strerror(errno));
-	} else if (!slre_compile(&slre, argv[1])) {
-		printf("Error compiling slre: %s\n", slre.err_str);
-		(void) fclose(fp);
-	} else if (slre.num_caps >= (int) (sizeof(caps) / sizeof(caps[0]))) {
-		/* slre_match() does not bounds-check the capture array */
-		printf("Error: too many capture groups (max %d)\n",
-		    (int) (sizeof(caps) / sizeof(caps[0])) - 1);
-		(void) fclose(fp);
-	} else {
-		slre_dump(&slre, stderr);
-
-		(void) memset(caps, 0, sizeof(caps));
-
-		/* Read at most the first sizeof(data) bytes (1 MB) of file */
-		len = (int) fread(data, 1, sizeof(data), fp);
-		(void) fclose(fp);
-
-		res = 0;
-		count = argc > 3 ? atoi(argv[3]) : 1;
-		for (i = 0; i < count; i++)
-			res = slre_match(&slre, data, len, caps);
-
-		printf("Result: %d\n", res);
-
-		for (i = 0; i < 20; i++)
-			if (caps[i].len > 0)
-				printf("Substring %d: [%.*s]\n", i,
-				    caps[i].len, caps[i].ptr);
-	}
-
-	return (0);
-}
-#endif /* TEST */

D vendor/slre/slre.h

@@ -1,92 +0,0 @@ 
-/*
- * Copyright (c) 2004-2005 Sergey Lyubka <valenok@gmail.com>
- * All rights reserved
- *
- * "THE BEER-WARE LICENSE" (Revision 42):
- * Sergey Lyubka wrote this file.  As long as you retain this notice you
- * can do whatever you want with this stuff. If we meet some day, and you think
- * this stuff is worth it, you can buy me a beer in return.
- */
-
-/*
- * This is a regular expression library that implements a subset of Perl RE.
- * Please refer to http://slre.sourceforge.net for detailed description.
- *
- * Usage example (parsing HTTP request):
- *
- * struct slre	slre;
- * struct cap	captures[4 + 1];  // Number of bracket pairs + 1
- * ...
- *
- * slre_compile(&slre,"^(GET|POST) (\S+) HTTP/(\S+?)\r\n");
- *
- * if (slre_match(&slre, buf, len, captures)) {
- *	printf("Request line length: %d\n", captures[0].len);
- *	printf("Method: %.*s\n", captures[1].len, captures[1].ptr);
- *	printf("URI: %.*s\n", captures[2].len, captures[2].ptr);
- * }
- *
- * Supported syntax:
- *	^		Match beginning of a buffer
- *	$		Match end of a buffer
- *	()		Grouping and substring capturing
- *	[...]		Match any character from set
- *	[^...]		Match any character but ones from set
- *	\s		Match whitespace
- *	\S		Match non-whitespace
- *	\d		Match decimal digit
- *	\r		Match carriage return
- *	\n		Match newline
- *	+		Match one or more times (greedy)
- *	+?		Match one or more times (non-greedy)
- *	*		Match zero or more times (greedy)
- *	*?		Match zero or more times (non-greedy)
- *	?		Match zero or once
- *	\xDD		Match byte with hex value 0xDD
- *	\meta		Match one of the meta character: ^$().[*+?\
- */
-
-#ifndef SLRE_HEADER_DEFINED
-#define	SLRE_HEADER_DEFINED
-
-/*
- * Compiled regular expression
- */
-struct slre {
-	unsigned char	code[256];	/* Compiled opcodes and operand bytes */
-	unsigned char	data[256];	/* Literal characters used by the code */
-	int		code_size;	/* Number of bytes used in code[] */
-	int		data_size;	/* Number of bytes used in data[] */
-	int		num_caps;	/* Number of bracket pairs	*/
-	int		anchored;	/* Must match from string start	*/
-	const char	*err_str;	/* Error string			*/
-};
-
-/*
- * Captured substring
- */
-/* `ptr` points into the buffer given to slre_match(); the substring is not
- * NUL-terminated at `len`. */
-struct cap {
-	const char	*ptr;		/* Pointer to the substring	*/
-	int		len;		/* Substring length		*/
-};
-
-/*
- * Compile regular expression. If success, 1 is returned.
- * If error, 0 is returned and slre.err_str points to the error message. 
- */
-int slre_compile(struct slre *, const char *re);
-
-/*
- * Return 1 if match, 0 if no match. 
- * If `captured_substrings' array is not NULL, then it is filled with the
- * values of captured substrings. captured_substrings[0] element is always
- * a full matched substring. The round bracket captures start from
- * captured_substrings[1].
- * It is assumed that the size of captured_substrings array is enough to
- * hold all captures. The caller function must make sure it is! So, the
- * array_size = number_of_round_bracket_pairs + 1
- */
-int slre_match(const struct slre *, const char *buf, int buf_len,
-	struct cap *captured_substrings);
-
-#endif /* SLRE_HEADER_DEFINED */

all repos — min @ a0598c9364e04d2c6691d8620e379fe7e82a9562

A small but practical concatenative programming language.