From: <de...@de...> - 2007-01-26 12:41:01
|
Author: CrawfordCurrie Date: 2007-01-26 06:40:14 -0600 (Fri, 26 Jan 2007) New Revision: 12628 Added: twiki/branches/MAIN/tools/native_search/cgrep.c twiki/branches/MAIN/tools/native_search/test.pl Modified: twiki/branches/MAIN/tools/native_search/Makefile.PL twiki/branches/MAIN/tools/native_search/NativeTWikiSearch.xs Log: Item3443: recoded to use pcre instead of pcreposix (which conflicts with mod perl somehow) and improved the algorithm while I was there. Added a test program that simulates grep. Modified: twiki/branches/MAIN/tools/native_search/Makefile.PL =================================================================== --- twiki/branches/MAIN/tools/native_search/Makefile.PL 2007-01-26 10:54:20 UTC (rev 12627) +++ twiki/branches/MAIN/tools/native_search/Makefile.PL 2007-01-26 12:40:14 UTC (rev 12628) @@ -1,7 +1,8 @@ # Makefile for NativeTWikiSearch module use ExtUtils::MakeMaker; WriteMakefile( - 'NAME' => 'NativeTWikiSearch', - 'OBJECT' => 'NativeTWikiSearch.o', - 'LIBS' => [ "-lpcreposix" ], + NAME => 'NativeTWikiSearch', + OBJECT => 'NativeTWikiSearch.o cgrep.o', + LIBS => [ "-lpcre" ], + CCFLAGS => "-g", ); Modified: twiki/branches/MAIN/tools/native_search/NativeTWikiSearch.xs =================================================================== --- twiki/branches/MAIN/tools/native_search/NativeTWikiSearch.xs 2007-01-26 10:54:20 UTC (rev 12627) +++ twiki/branches/MAIN/tools/native_search/NativeTWikiSearch.xs 2007-01-26 12:40:14 UTC (rev 12628) @@ -7,198 +7,76 @@ #include "EXTERN.h" #include "perl.h" #include "XSUB.h" -#include <pcreposix.h> -#define LINEBUFSIZE 4192 -#define ERRBUFSIZE 256 -#define MATCHBUFSIZE 1 - -/* Copy the static match buffer into heap memory, resizing as required */ -char** _backup(int mc, char** m, char** r) { - int curlen = 0; - char** newR; - if (!mc) { - return r; - } - if (r) { - while (r[curlen]) { - curlen++; - } - } - newR = (char**)safemalloc(sizeof(char*) * (curlen + mc + 1)); - if (curlen) { - memcpy(newR, r, sizeof(char*) * curlen); - } - memcpy(&newR[curlen], m, sizeof(char*) * mc); - newR[curlen + mc] = (char*)NULL; - if (r) { - safefree(r); - } - - return newR; -} - -/* Do a grep. Arguments are provided in argv, options first, then the - * pattern, then the file names. -i (case insensitive) and -l (report - * matching file names only) are the only options supported. */ -char** cgrep(char** argv) { - char** argptr = argv; - int reflags = REG_NOSUB; - int justFiles = 0; - FILE* f; - regex_t pattern; - regmatch_t match; - char linebuf[LINEBUFSIZE]; - char* matchCache[MATCHBUFSIZE]; - int matchCacheSize = 0; - char** result = (char**)NULL; - int resultSize; - char* fname; - - while (*argptr) { - char* arg = *(argptr++); - if (strcmp(arg, "-i") == 0) { - reflags |= REG_ICASE; - safefree(arg); - } else if (strcmp(arg, "-l") == 0) { - justFiles = 1; - safefree(arg); - } else { - int ern; - if (ern = regcomp(&pattern, arg, reflags)) { - char erb[ERRBUFSIZE]; - regerror(ern, &pattern, erb, ERRBUFSIZE); - warn(erb); - safefree(arg); - return (char**)NULL; - } - safefree(arg); - break; - } - } - while (*argptr) { - fname = *(argptr++); - f = fopen(fname, "r"); - if (f) { - int ern; - int mi; - int size; - char ch = 0; - while (ch >= 0) { - int chc = 0; - while ((ch = fgetc(f)) >= 0) { - if (ch == '\n' || chc == LINEBUFSIZE - 1) { - break; /* got a lineful */ - } - linebuf[chc++] = ch; - } - linebuf[chc] = '\0'; - if ((ern = regexec(&pattern, linebuf, 1, &match, 0)) == 0) { - /* Successful match */ - if (matchCacheSize == MATCHBUFSIZE) { - result = _backup(matchCacheSize, matchCache, result); - matchCacheSize = 0; - } - mi = matchCacheSize++; - size = strlen(fname); - if (!justFiles) { - size += 1 + strlen(linebuf); - } - matchCache[mi] = (char*)safemalloc(size + 1); - strcpy(matchCache[mi], fname); - if (!justFiles) { - strcat(matchCache[mi], ":"); - strcat(matchCache[mi], linebuf); - /* go to next matching line in this file */ - } - if (justFiles) { - break; /* go to next file */ - } - } - } - fclose(f); - safefree(fname); - } else { - warn("Open failed"); - } - } - safefree(argv); - result = _backup(matchCacheSize, matchCache, result); - return result; -} - -/* Next two functions taken from - * http://search.cpan.org/src/TBUSCH/Lucene-0.06/Av_CharPtrPtr.cpp - * and modified +/* + * Unpack perl args into an array of (read only) strings. The function name + * is dictated by the mapping in the default typemap i.e. + * (char** -> T_PACKEDARRAY -> XS_unpack_charPtrPtr */ -char ** XS_unpack_charPtrPtr(SV* rv ) -{ +char ** XS_unpack_charPtrPtr(SV* rv) { AV *av; SV **ssv; char **s; int avlen; int x; - if( SvROK( rv ) && (SvTYPE(SvRV(rv)) == SVt_PVAV) ) + if (SvROK(rv) && (SvTYPE(SvRV(rv)) == SVt_PVAV)) av = (AV*)SvRV(rv); else { - warn("XS_unpack_charPtrPtr: rv was not an AV ref"); - return( (char**)NULL ); + warn("unpack_args: rv was not an AV ref"); + return ((char**)NULL); } /* is it empty? */ avlen = av_len(av); - if( avlen < 0 ){ - warn("XS_unpack_charPtrPtr: array was empty"); - return( (char**)NULL ); + if (avlen < 0){ + warn("unpack_args: array was empty"); + return ((char**)NULL); } /* av_len+2 == number of strings, plus 1 for an end-of-array sentinel. */ - s = (char **)safemalloc( sizeof(char*) * (avlen + 2) ); - if( s == NULL ){ - warn("XS_unpack_charPtrPtr: unable to malloc char**"); - return( (char**)NULL ); + s = (char **)malloc(sizeof(char*) * (avlen + 2)); + if (s == NULL){ + warn("unpack_args: unable to malloc char**"); + return ((char**)NULL); } - for( x = 0; x <= avlen; ++x ){ - ssv = av_fetch( av, x, 0 ); - if( ssv != NULL ){ - if( SvPOK( *ssv ) ){ - s[x] = (char *)safemalloc( SvCUR(*ssv) + 1 ); - if( s[x] == NULL ) - warn("XS_unpack_charPtrPtr: unable to malloc char*"); - else - strcpy( s[x], SvPV( *ssv, PL_na ) ); - } + for (x = 0; x <= avlen; ++x){ + s[x] = (char*)NULL; + ssv = av_fetch(av, x, 0); + if (ssv != NULL){ + s[x] = (char *)malloc( SvCUR(*ssv) + 1 ); + if (SvPOK(*ssv)) + strcpy(s[x], SvPV(*ssv, PL_na)); else - warn("XS_unpack_charPtrPtr: array elem %d was not a string.", x ); + warn("unpack_args: array elem %d was not a string.", x); } - else - s[x] = (char*)NULL; } s[x] = (char*)NULL; /* sentinel */ - return( s ); + return s; } -/* Used by the OUTPUT typemap for char**. - * Will convert a C char** to a Perl AV*, freeing the char** and the strings - * stored in it +/* + * Convert a C char** to a Perl AV*, freeing the char** and the strings + * stored in it. The function name is dictated by the mapping in the + * default typemap i.e. + * (char** -> T_PACKEDARRAY -> XS_pack_charPtrPtr */ -void XS_pack_charPtrPtr(SV* st, char **s, int n) -{ +void XS_pack_charPtrPtr(SV* st, char **s, int n) { AV *av = newAV(); SV *sv; char **c; - - for( c = s; *c != NULL; ++c ){ - sv = newSVpv( *c, 0 ); - safefree(*c); - av_push( av, sv ); + if (!s) + return; + for(c = s; *c; c++){ + sv = newSVpv(*c, 0); + av_push(av, sv); + free(*c); } - sv = newSVrv( st, NULL ); /* upgrade stack SV to an RV */ - SvREFCNT_dec( sv ); /* discard */ - SvRV( st ) = (SV*)av; /* make stack RV point at our AV */ - safefree(s); + sv = newSVrv(st, NULL); /* upgrade stack SV to an RV */ + SvREFCNT_dec(sv); /* discard */ + SvRV(st) = (SV*)av; /* make stack RV point at our AV */ + free(s); } MODULE = NativeTWikiSearch PACKAGE = NativeTWikiSearch Added: twiki/branches/MAIN/tools/native_search/cgrep.c =================================================================== --- twiki/branches/MAIN/tools/native_search/cgrep.c 2007-01-26 10:54:20 UTC (rev 12627) +++ twiki/branches/MAIN/tools/native_search/cgrep.c 2007-01-26 12:40:14 UTC (rev 12628) @@ -0,0 +1,154 @@ +/* Copyright (C) 2007 WikiRing http://wikiring.com All Rights Reserved + * Author: Crawford Currie + * Fast grep function designed for use from Perl. Does not suffer from + * limitations of `grep` viz. cost of spawning a subprocess, and + * limits on command-line length. + */ +#include <pcre.h> +#include <stdio.h> +#include <string.h> + +#define DATABUFSIZE 4192 +#define ERRBUFSIZE 256 +#define MATCHBUFSIZE 1 + +extern int errno; + +/* Copy the static match buffer into heap memory, resizing as required */ +char** _backup(int mc, char** m, char** r) { + int curlen = 0; + char** newR = NULL; + if (!mc) { + return r; + } + if (r) { + while (r[curlen]) { + curlen++; + } + newR = (char**)realloc(r, sizeof(char*) * (curlen + mc + 1)); + } + + if (!newR) { + newR = (char**)malloc(sizeof(char*) * (mc + 1)); + } + + memcpy(newR + curlen, m, sizeof(char*) * mc); + newR[curlen + mc] = NULL; + + return newR; +} + +/* Release memory used in the XS interface */ +void cleanup(char** argv) { + char** ptr = argv; + + while (*ptr) { + free(*ptr); + ptr++; + } + free(argv); +} + +/* Do a grep. Arguments are provided in argv, options first, then the + * pattern, then the file names. -i (case insensitive) and -l (report + * matching file names only) are the only options supported. */ +char** cgrep(char** argv) { + char** argptr = argv; + /* Check for UTF8 support using pcre_config */ + int erk; + int reflags = PCRE_NO_AUTO_CAPTURE; + if (pcre_config(PCRE_CONFIG_UTF8, &erk) && erk) { + reflags |= PCRE_UTF8 | PCRE_NO_UTF8_CHECK; + } + int justFiles = 0; + FILE* f; + pcre* pattern; + pcre_extra* study; + + int linebufsize = DATABUFSIZE; + char* linebuf; + char* matchCache[MATCHBUFSIZE]; + int matchCacheSize = 0; + char** result = NULL; + int resultSize; + char* fname; + const char* err; + int errPos; + + while (*argptr) { + char* arg = *(argptr++); + if (strcmp(arg, "-i") == 0) { + reflags |= PCRE_CASELESS; + } else if (strcmp(arg, "-l") == 0) { + justFiles = 1; + } else { + if (!(pattern = pcre_compile(arg, reflags, &err, &errPos, NULL))) { + warn(err); + } + if (!pattern) { + cleanup(argv); + return NULL; + } + break; + } + } + + /* Study the pattern to accelerate matching */ + study = pcre_study(pattern, 0, &err); + if (err) { + warn(err); + cleanup(argv); + return NULL; + } + + linebuf = malloc(linebufsize); + while (*argptr) { + fname = *(argptr++); + f = fopen(fname, "r"); + if (f) { + int ern; + int mi; + int size; + char ch = 0; + int ovec[30]; + int matchResult; + int chc; + while ((chc = getline(&linebuf, &linebufsize, f)) > 0) { + matchResult = pcre_exec(pattern, study, linebuf, + chc, 0, 0, ovec, 30); + if (matchResult >= 0) { + /* Successful match */ + if (matchCacheSize == MATCHBUFSIZE) { + /* Back up the cache if it's full */ + result = _backup(matchCacheSize, matchCache, result); + matchCacheSize = 0; + } + mi = matchCacheSize++; + size = strlen(fname); + if (linebuf[strlen(linebuf)-1] == '\n') { + linebuf[strlen(linebuf)-1] = '\0'; + } + if (!justFiles) { + size += 1 + strlen(linebuf); + } + matchCache[mi] = (char*)malloc(size + 1); + strcpy(matchCache[mi], fname); + if (!justFiles) { + strcat(matchCache[mi], ":"); + strcat(matchCache[mi], linebuf); + /* go to next matching line in this file */ + } else { + break; /* go to next file */ + } + } + } + fclose(f); + } else { + warn("Open failed %d", errno); + } + } + free(linebuf); + result = _backup(matchCacheSize, matchCache, result); + cleanup(argv); + return result; +} Added: twiki/branches/MAIN/tools/native_search/test.pl =================================================================== --- twiki/branches/MAIN/tools/native_search/test.pl 2007-01-26 10:54:20 UTC (rev 12627) +++ twiki/branches/MAIN/tools/native_search/test.pl 2007-01-26 12:40:14 UTC (rev 12628) @@ -0,0 +1,9 @@ +#!/usr/bin/perl +# Test program for NativeTWikiSearch +# If it is correctly installed, this program will accept parameters like grep +# e.g. +# perl test.pl -i -l NativeTWikiSearch test.pl Makefile.PL NativeTWikiSearch.xs +# +use NativeTWikiSearch; +my $result = NativeTWikiSearch::cgrep(\@ARGV); +print "RESULT\n".join("\n", @$result)."\n"; |