/************************************************************************/ /* */ /* STBM -- Self-Tuning Boyer-Moore string search */ /* */ /* Implements behoffski's variant of the Boyer-Moore search. */ /* This version was [re-?]invented after contemplating the */ /* 1977 BM paper and the 1980 Sunday paper... but before */ /* behoffski knew of the 1991 Hume and Sunday paper. */ /* */ /* There are two essential differences between this algorithm */ /* and the one used in GNU Grep: */ /* 1. GNU Grep uses the "Tuned Boyer-Moore" algorithm */ /* as suggested by Hume and Sunday's 1991 paper. */ /* This version uses a *Self*-Tuning BM algorithm. */ /* 2. GNU Grep handles case insensitivity by adding a */ /* general-purpose translation table that maps each */ /* byte before being examined by the search. As */ /* already demonstrated by Grouse Grep's state tables, */ /* this is inefficient if the translation is static, */ /* as the extra lookup per character can be eliminated */ /* by modifying the state tables to reflect the effects */ /* of the translation. */ /* */ /* So what is the Self-Tuning Boyer-Moore algorithm? Like */ /* most BM algorithms, it has a skip loop, a guard test, */ /* and a match loop. The critical difference is that we */ /* feed information from the match loop back into the guard */ /* test so that we can find the most effective guard. This */ /* dynamic feedback costs us little or nothing to implement, */ /* yet can substantially improve the search efficiency. */ /* This dynamic feedback is more effective than a static */ /* guard as we tune ourselves to the actual data, instead of */ /* relying on some guess about the likely nature of the input. */ /* */ /* The feedback is simple and cheap -- we remember the */ /* offset where the match loop detected a mismatch, and use */ /* that position as the guard! In addition, we impose a */ /* round-robin ordering on the match loop, so that each */ /* position of the search string has an equal opportunity */ /* to become the guard. We must pay the cost of setting up */ /* two loops when matching instead of just one, but this */ /* effort is more than repaid if (usually) we find a good */ /* guard. To some extent, the performance of this code will */ /* depend on the characteristics of the code generated by */ /* the compiler. This initial version was written and */ /* evaluated using GCC. */ /* */ /* Note also that the correct probability function for the */ /* guard is conditional on the success of the BM skip */ /* search -- so, for example, when searching for "tree" */ /* in /usr/dict/words, the best guard character (the */ /* character least likely to match) -- given that the skip */ /* loop has matched an "e" -- is in fact the preceding "e", */ /* despite it being the most common letter in the file. */ /* Most static guess algorithms don't take this conditional */ /* probability into account: The self-tuning BM search gets */ /* it right as all the operations on the guard that affect */ /* its mismatch efficiency occur in the context where the */ /* last character has already matched. */ /* */ /* References: */ /* "A Fast String Searching Algorithm" */ /* Robert S. Boyer & J Strother Moore */ /* Communications of the ACM */ /* Vol. 20 No. 10, October 1977 pp. 762-772 */ /* */ /* "A Very Fast Substring Search Algorithm" */ /* Daniel M. Sunday */ /* Communications of the ACM */ /* Vol. 33 No. 8, August 1990, pp. 132-142 */ /* */ /* "Fast String Searching" */ /* Andrew Hume & Daniel Sunday */ /* Software - Practice & Experience */ /* Vol. 21(11), 1221-1248 (November 1991) */ /* */ /* Copyright (C) 1998-2000 Grouse Software. All rights reserved. */ /* Written for Grouse by behoffski (Brenton Hoff). */ /* */ /* Free software: no warranty; use anywhere is ok; spread the */ /* sources; note any mods; share variations and derivatives */ /* (including sending to behoffski@grouse.com.au). */ /* */ /************************************************************************/ #include "ascii.h" #include <compdef.h> #include "ctype.h" #include "platform.h" #include "stbm.h" #include <stdio.h> #include <stdlib.h> /*Macro to provide very-low-cost case folding to lower case*/ #define STBM_TOLOWER(c) (gSTBM.CaseFold[c]) struct STBM_SearchSpec_Struct { /*Lookup table for "fast" loop*/ UINT SkipTable[STBM_ALPHABET_SIZE]; UINT md2; /*Original pattern, needed to complete match*/ UINT PatLen; BYTE *pPattern; LWORD Flags; /*Skip table for where guard fails to match*/ /* ?? not sure if it's worth doing this*/ }; typedef struct { /*Streamlined lowercase-folding table*/ BYTE CaseFold[256]; } STBM_MODULE_CONTEXT; module_scope STBM_MODULE_CONTEXT gSTBM; /************************************************************************/ /* */ /* Init -- Prepare module for operation */ /* */ /************************************************************************/ public_scope void STBM_Init(void) { UINT i; /*Initialise low-cost case folding table*/ for (i = 0; i < 256; i++) { gSTBM.CaseFold[(BYTE) i] = tolower((BYTE) i); } } /*Init*/ /************************************************************************/ /* */ /* Compile -- Analyse pattern and prepare for search */ /* */ /* Converts the pattern string into a format optimal for */ /* the Search function, and returns the compiled format */ /* as a reference to use in subsequent searches. */ /* */ /* The caller must maintain the pattern string provided here */ /* for the duration of the search, as this function does not */ /* bother to take its own copy. Even worse, if the search */ /* is case insensitive, the pattern string will be edited to */ /* make the case of all letters consistent. If necessary, */ /* copy the string to a separate buffer if either of these */ /* behaviours conflicts with other parts of the program. */ /* */ /* Returns FALSE if unable to handle pattern (probably */ /* due to insufficient memory). */ /* */ /************************************************************************/ public_scope BOOL STBM_Compile(UINT PatLen, CHAR *pPattern, LWORD Flags, STBM_SearchSpec **ppSpec) { STBM_SearchSpec *pSpec = (STBM_SearchSpec *) NIL; BYTE *pPat; BYTE *pPatEnd; UINT i; /*Destroy return value to reduce chances of being misinterpreted*/ *ppSpec = (STBM_SearchSpec *) NIL; pSpec = (STBM_SearchSpec *) Platform_SmallMalloc(sizeof(*pSpec)); if (pSpec == NULL) { /*Sorry, unable to allocate memory for compiled format*/ return FALSE; } /*Record the original pattern in the specification*/ pSpec->pPattern = (BYTE *) pPattern; pSpec->PatLen = PatLen; pSpec->Flags = Flags; /*Set up skip table in the usual Tuned-BM fashion*/ pPat = (BYTE *) pPattern; pPatEnd = pPat + PatLen - 1; for (i = 0; i < DIM(pSpec->SkipTable); i++) { pSpec->SkipTable[i] = PatLen; } if ((Flags & STBM_SEARCH_CASE_INSENSITIVE) == 0) { /*Exact match required (case sensitive)*/ for (; pPat < pPatEnd; pPat++) { pSpec->SkipTable[*pPat] = pPatEnd - pPat; } pSpec->md2 = pSpec->SkipTable[*pPat]; pSpec->SkipTable[*pPat] = 0; } else { /*Case-insensitive search -- set up table accordingly*/ for (; pPat < pPatEnd; pPat++) { /*Force supplied pattern to have consistent case*/ *pPat = tolower(*pPat); pSpec->SkipTable[*pPat] = pPatEnd - pPat; pSpec->SkipTable[toupper(*pPat)] = pPatEnd - pPat; } *pPat = tolower(*pPat); pSpec->md2 = pSpec->SkipTable[*pPat]; pSpec->SkipTable[*pPat] = 0; pSpec->SkipTable[toupper(*pPat)] = 0; } /*Prepared for search successfully*/ *ppSpec = pSpec; return TRUE; } /*Compile*/ /************************************************************************/ /* */ /* Search -- Report first occurrence of pattern in buffer */ /* */ /************************************************************************/ public_scope CHAR * STBM_Search(STBM_SearchSpec *pSpec, CHAR *pBuffer, UINT BufferLength) { UINT *pSkip; BYTE *pText; UINT Skip; BYTE *pEnd; INT GuardOffset; BYTE *pPattern = pSpec->pPattern; BYTE *pPatEnd; BYTE GuardChar; INT i; INT NegLen; /*Is the buffer too short to hold the target string?*/ if (BufferLength < pSpec->PatLen) { /*Yes, can't match*/ return NULL; } /*Prepare for search*/ pEnd = (BYTE *) (pBuffer + BufferLength); pSkip = &pSpec->SkipTable[0]; pText = (BYTE *) pBuffer; pText += (pSpec->PatLen - 1); pPatEnd = pPattern + pSpec->PatLen - 1; NegLen = -pSpec->PatLen; GuardOffset = -1; if (pSpec->PatLen == 1) { GuardOffset = 0; } GuardChar = pPatEnd[GuardOffset]; goto SkipLoop; CheckGuard: if (pText >= pEnd) return NULL; if (pText[GuardOffset] == GuardChar) goto TryMatch; pText += pSpec->md2; /* FALLTHROUGH */ /*Main search is in "fast" skip loop*/ SkipLoop: for (;;) { /*Note: When (re-)entering skip loop, checks more often*/ Skip = pSkip[*pText]; if (Skip == 0) goto CheckGuard; pText += Skip; pText += pSkip[*pText]; pText += pSkip[*pText]; Skip = pSkip[*pText]; if (Skip == 0) goto CheckGuard; pText += Skip; pText += pSkip[*pText]; pText += pSkip[*pText]; /*May need end-of-buffer check here in some applications*/ /*if (pText >= pEnd) return NULL;*/ } TryMatch: /*Check pattern from (Guard-1)..0*/ for (i = GuardOffset - 1; i > NegLen; i--) { if (pPatEnd[i] != pText[i]) goto Mismatch; } /*Now check pattern from (PatLen - 1) to (Guard + 1)*/ for (i = -1; i > GuardOffset; i--) { if (pPatEnd[i] != pText[i]) goto Mismatch; } /*?? Stash current guard position to use for next search*/ /*Match found: return pointer to start of text in buffer*/ return pText + NegLen + 1; Mismatch: /*Use last mismatch as new guard character*/ GuardChar = pPatEnd[i]; GuardOffset = i; pText += pSpec->md2; goto SkipLoop; } /*Search*/ /************************************************************************/ /* */ /* SearchCI -- Search for pattern in buffer (case insensitive) */ /* */ /************************************************************************/ public_scope CHAR * STBM_SearchCI(STBM_SearchSpec *pSpec, CHAR *pBuffer, UINT BufferLength) { UINT *pSkip; BYTE *pText; UINT Skip; BYTE *pEnd; INT GuardOffset; BYTE GuardChar; BYTE *pPattern = pSpec->pPattern; BYTE *pPatEnd; INT i; INT NegLen; /*Prepare for search*/ pEnd = (BYTE *) (pBuffer + BufferLength); pSkip = &pSpec->SkipTable[0]; pText = (BYTE *) pBuffer; pPatEnd = pPattern + pSpec->PatLen - 1; NegLen = -pSpec->PatLen; GuardOffset = -1; if (pSpec->PatLen == 1) { GuardOffset = 0; } GuardChar = STBM_TOLOWER(pPatEnd[GuardOffset]); goto SkipLoop; CheckGuard: if (pText >= pEnd) return NULL; if (STBM_TOLOWER(pText[GuardOffset]) == GuardChar) goto TryMatch; pText += pSpec->md2; /* FALLTHROUGH */ /*Main search is in "fast" skip loop*/ SkipLoop: for (;;) { /*Note: When (re-)entering skip loop, checks more often*/ Skip = pSkip[*pText]; if (Skip == 0) goto CheckGuard; pText += Skip; pText += pSkip[*pText]; pText += pSkip[*pText]; Skip = pSkip[*pText]; if (Skip == 0) goto CheckGuard; pText += Skip; pText += pSkip[*pText]; pText += pSkip[*pText]; /*May need end-of-buffer check here in some applications*/ /*if (pText >= pEnd) return NULL;*/ } TryMatch: /*Check pattern from (Guard-1)..0*/ for (i = GuardOffset - 1; i > NegLen; i--) { if (pPatEnd[i] != STBM_TOLOWER(pText[i])) goto Mismatch; } /*Now check pattern from (PatLen - 1) to (Guard + 1)*/ for (i = -1; i > GuardOffset; i--) { if (pPatEnd[i] != STBM_TOLOWER(pText[i])) goto Mismatch; } /*?? Stash current guard position to use for next search*/ /*Match found: return pointer to start of text in buffer*/ return pText + NegLen + 1; Mismatch: /*Use last mismatch as new guard character*/ GuardChar = STBM_TOLOWER(pPatEnd[i]); GuardOffset = i; pText += pSpec->md2; goto SkipLoop; } /*SearchCI*/ /************************************************************************/ /* */ /* SearchTBM -- Search using Tuned BM (not self-tuned BM) */ /* */ /* This routine is provided merely for reference/comparison. */ /* */ /* It allows the Tuned Boyer-Moore algorithm to be used to */ /* perform fixed-string searches so that direct comparisons */ /* with behoffski's self-tuned version can be made. */ /* Otherwise, it's too hard to separate algorithm performance */ /* from the performance of the program control. */ /* */ /* The variant of the Tuned BM search implemented here is */ /* based on the version implemented by GNU Grep. In */ /* particular, the static guard is always the second-to-last */ /* character of the string. */ /* */ /************************************************************************/ public_scope CHAR * STBM_SearchTBM(STBM_SearchSpec *pSpec, CHAR *pBuffer, UINT BufferLength) { UINT *pSkip; BYTE *pText; UINT Skip; BYTE *pEnd; BYTE *pPattern = pSpec->pPattern; BYTE *pPatEnd; BYTE GuardChar; INT i; INT NegLen; /*Is the buffer too short to hold the target string?*/ if (BufferLength < pSpec->PatLen) { /*Yes, can't match*/ return NULL; } /*Prepare for search*/ pEnd = (BYTE *) (pBuffer + BufferLength); pSkip = &pSpec->SkipTable[0]; pText = (BYTE *) pBuffer; pText += (pSpec->PatLen - 1); pPatEnd = pPattern + pSpec->PatLen - 1; NegLen = -pSpec->PatLen; GuardChar = pPatEnd[-1]; goto SkipLoop; CheckGuard: if (pText >= pEnd) return NULL; if (pText[-1] == GuardChar) goto TryMatch; pText += pSpec->md2; /* FALLTHROUGH */ /*Main search is in "fast" skip loop*/ SkipLoop: for (;;) { /*Note: When (re-)entering skip loop, checks more often*/ Skip = pSkip[*pText]; if (Skip == 0) goto CheckGuard; pText += Skip; pText += pSkip[*pText]; pText += pSkip[*pText]; Skip = pSkip[*pText]; if (Skip == 0) goto CheckGuard; pText += Skip; pText += pSkip[*pText]; pText += pSkip[*pText]; /*May need end-of-buffer check here in some applications*/ /*if (pText >= pEnd) return NULL;*/ } TryMatch: /*Check pattern from (Guard-1)..0*/ for (i = -2; i > NegLen; i--) { if (pPatEnd[i] != pText[i]) goto Mismatch; } /*Match found: return pointer to start of text in buffer*/ return pText + NegLen + 1; Mismatch: /*Use last mismatch as new guard character*/ pText += pSpec->md2; goto SkipLoop; } /*SearchTBM*/ /************************************************************************/ /* */ /* GetPattern -- Retrieve originally-supplied pattern pointer */ /* */ /* In many cases, the client may malloc the text, and then hand */ /* the text to STBM to be compiled. It's possible to have a */ /* memory leak if the pattern is destroyed without freeing */ /* the text. However, it's mighty inconvenient to carry around */ /* the text pointer in parallel to the STBM version, especially */ /* as STBM stores the pointer anyway. */ /* */ /* So this function allows the client to retrieve the text */ /* pointer stored by STBM so that the memory may be freed. */ /* */ /************************************************************************/ public_scope void STBM_GetPattern(STBM_SearchSpec *pSpec, CHAR **ppPatternText) { /*Write pattern pointer to caller*/ *ppPatternText = pSpec->pPattern; } /*GetPattern*/ /************************************************************************/ /* */ /* Destroy -- Get rid of optimised search spec */ /* */ /* Discards the resources acquired when the search pattern */ /* was compiled. Note the text containing the original */ /* pattern might need freeing also; this is not done here. */ /* See GetPattern if you need to recover the pointer for */ /* freeing. */ /* */ /************************************************************************/ public_scope void STBM_Destroy(STBM_SearchSpec *pSpec) { /*Free memory acquired to hold search spec*/ Platform_SmallFree(pSpec); } /*Destroy*/