/************************************************************************/ /* */ /* ScanFile -- Manage RE search for a single file */ /* */ /* This module accepts the regular expression to be used */ /* plus various search and display options, and performs */ /* the requested search on each file given. It uses FastFile */ /* to bring the file into a memory buffer in a line-oriented */ /* fashion, and RETable and/or the self-tuned Boyer-Moore */ /* search algorithms to implement the search on each buffer. */ /* */ /* The search effort is partitioned into a fast file scan */ /* search and a slower match portion. This division is */ /* made so that the file may be searched with the least */ /* effort. However, the decision on what's appropriate */ /* as the scan RE is partially dependent on the nature of */ /* the file being searched: we may perform fairly poorly */ /* if our guesses are wrong. */ /* */ /* File offsets are handled by 32-bit integers, which is */ /* inadequate for really big files. */ /* */ /* The whole structure of this module, which in some */ /* distorted way extends to include matcheng.h, is a bit of */ /* a mess. It needs to be split into smaller and more */ /* coherent pieces, but exactly how isn't clear. */ /* */ /* Another problem is that while almost everything else in */ /* ggrep is reentrant, this module most certainly isn't. */ /* */ /* Copyright (C) Grouse Software 1995-2000. All rights reserved. */ /* Written for Grouse by behoffski (Brenton Hoff). */ /* */ /* Free software: no warranty; use anywhere is ok; spread the */ /* sources; note any mods; share variations and derivatives */ /* (including sending to behoffski@grouse.com.au). 
*/ /* */ /************************************************************************/ #include "ascii.h" #include <compdef.h> #include <dirent.h> #include <errno.h> #include "fastfile.h" #include "main.h" #include "matcheng.h" #include "memrchr.h" #include <memory.h> #include "platform.h" #include "retable.h" #include "scanfile.h" #include "stbm.h" #include "stbmshim.h" #include <stdio.h> #include <sys/types.h> #include "tbldisp.h" #include "tracery.h" #include <stdarg.h> /*Parameters for buffering file into lines*/ #define FILE_BUFFER_SIZE (4096uL * 14) #define BYTES_BEFORE_BUFFER 8 #define BYTES_AFTER_BUFFER (64 + 4) #define SCANFILE_DIR_NAME_SIZE_DEFAULT 16384 /*Note: BYTES_AFTER_BUFFER must be >= BOYER_MOORE_LOOKAHEAD_MAX*/ /*Use behoffski's favourite byte value as an endmarker*/ #define SCANFILE_ENDMARKER_DEFAULT 0xee /*File stats plus parent pointer so we may search for recursion loops*/ typedef struct { void *pParent; struct stat stat; } ScanFile_Stats; typedef BOOL (FILE_SCANNER)(void); typedef struct { /*------------Variables controlling matching each buffer-----------*/ /*Tracery control block for this module*/ Tracery_ObjectInfo TraceInfo; /*Function+context for fast buffer scanning*/ MatchEng_MatchFunction pScan; MatchEng_Spec *pScanContext; /*Search/match context shared between modules (used by fast scan)*/ MatchEng_Details Details; /*Function+context for completing matching once scan text found*/ MatchEng_MatchFunction pMatch; MatchEng_Spec *pMatchContext; /*Duplicate context used by slower match attempts*/ MatchEng_Details Details2; /*Function to handle lines selected by search*/ MatchEng_SelectFunction *pSelect; /*Match sense -- line selection may be inverted by caller*/ BOOL SelectMatchingLines; /*Flags recording if any lines matched overall and for current file*/ BOOL MatchedAny; /*Flag indicating whether inverted blocks need to be unpacked*/ BOOL UnpackBlocks; /*Flag indicating whether to recurse directories*/ BOOL RecurseDir; /*Flag naming if we 
want to find the line start*/ BOOL FindLineStart; /*------------------File buffer conditioning---------------*/ /*FastFile file handle*/ FastFile_Context *pHandle; /*Variables for conditioning start of memory buffer*/ CHAR PrecedingLF; /*Memory specifying bytes after buffer to optimise search*/ UINT EndLength; CHAR EndBytes[BYTES_AFTER_BUFFER]; /*------------Treely-ruly-module-related variables--------------*/ /*Platform-specific functions to display matches*/ MatchEng_SelectFunction *pNormalOut; MatchEng_SelectFunction *pHighlightOut; MatchEng_SelectFunction *pFilenameOut; /*RE match function provided by client*/ MatchEng_MatchFunction pExternMatchFunc; /*Debugging options*/ LWORD Debug; } SCANFILE_MODULE_CONTEXT; module_scope SCANFILE_MODULE_CONTEXT gScanFile; /*Extra information for Tracery operation*/ #ifdef TRACERY_ENABLED #define TRACERY_MODULE_INFO (gScanFile.TraceInfo) /*Debugging/tracing flags*/ #define SCANFILE_T_BUFFER BIT0 #define SCANFILE_T_SCAN BIT1 #define SCANFILE_T_MATCH BIT2 #define SCANFILE_T_DIR BIT3 module_scope Tracery_EditEntry gScanFile_TraceryEditDefs[] = { {"B", SCANFILE_T_BUFFER, SCANFILE_T_BUFFER, "Trace buffer"}, {"b", SCANFILE_T_BUFFER, 0x00, "Ignore buffer"}, {"S", SCANFILE_T_SCAN, SCANFILE_T_SCAN, "Trace scanner"}, {"s", SCANFILE_T_SCAN, 0x00, "Ignore scanner"}, {"M", SCANFILE_T_MATCH, SCANFILE_T_MATCH, "Trace matcher"}, {"m", SCANFILE_T_MATCH, 0x00, "Ignore matcher"}, {"D", SCANFILE_T_DIR, SCANFILE_T_DIR, "Trace directory"}, {"d", SCANFILE_T_DIR, 0x00, "Ignore directory"}, TRACERY_EDIT_LIST_END }; #endif /*TRACERY_ENABLED*/ /************************************************************************/ /* */ /* Start -- Begin managing what has to be managed */ /* */ /************************************************************************/ public_scope BOOL ScanFile_Start(void) { /*Make sure FastFile starts first*/ if (! 
FastFile_Start(FILE_BUFFER_SIZE)) { return FALSE; } return TRUE; } /*Start*/ /************************************************************************/ /* */ /* NewScanContext -- Prepare blank scan context block */ /* */ /* Typically we get out scan context from RETable, as we set */ /* to use the table-driven architecture. However, in some */ /* cases we use an alterative scan engine (e.g. STBM). */ /* This function provides a basic scan context block for */ /* alternate searches to use. */ /* */ /* The whole implementation of scan context is rather klunky */ /* and would benefit from a careful restructuring. */ /* */ /************************************************************************/ module_scope BOOL ScanFile_NewScanContext(MatchEng_Spec **ppScanContext) { MatchEng_Spec *pScanContext; /*Destroy return arguments to reduce chance of being misunderstood*/ *ppScanContext = (MatchEng_Spec *) NULL; /*Acquire memory to store context*/ pScanContext = (MatchEng_Spec *) Platform_SmallMalloc(sizeof(*pScanContext)); if (pScanContext == NULL) { /*Sorry, no memory available*/ return FALSE; } /*Okay, set up reasonable defaults for context block*/ /* ?? */ memset(pScanContext, 0, sizeof(*pScanContext)); /*Created context, write to caller and report success*/ *ppScanContext = pScanContext; return TRUE; } /*NewScanContext*/ /************************************************************************/ /* */ /* MatchedAbandon -- Halt search if matching line found */ /* */ /* This function is used for the -L search option. 
*/
/*                                                                      */
/************************************************************************/
module_scope BOOL
ScanFile_MatchedAbandon(MatchEng_Details *pDetails)
{
        /*Search can abandon current file: FALSE tells the caller to stop*/
        return FALSE;

} /*MatchedAbandon*/


/************************************************************************/
/*                                                                      */
/*      Open -- Prepare file for scanning                               */
/*                                                                      */
/*      The first call opens a new FastFile handle and conditions       */
/*      the start and end of its buffer; later calls reuse that         */
/*      handle via FastFile_Reopen.  pFilename may be NULL, in which    */
/*      case the file is reported as "(standard input)".  pStat is      */
/*      handed to FastFile along with the name.                         */
/*                                                                      */
/*      Returns TRUE if the file is ready to be read.  Returns FALSE,   */
/*      after writing a message to stderr and recording                 */
/*      MAIN_RETURN_FAULT, if the file cannot be opened or the          */
/*      buffer cannot be conditioned.                                   */
/*                                                                      */
/************************************************************************/
module_scope BOOL
ScanFile_Open(CHAR *pFilename, struct stat *pStat)
{
        BOOL FirstFile;
        BOOL Opened;

        /*Is this the second or later file to be searched?*/
        FirstFile = TRUE;
        if (gScanFile.pHandle != NULL) {
                /*Yes, remember this for later*/
                FirstFile = FALSE;
        }

        /*Is this the first file being searched?*/
        if (FirstFile) {
                /*Yes, open a new FastFile handle*/
                Opened = FastFile_Open(pFilename,
                                       FILE_BUFFER_SIZE,
                                       FASTFILE_P_MODE_LINE,
                                       pStat,
                                       &gScanFile.pHandle);
        } else {
                /*No, reuse existing handle -- it's faster*/
                Opened = FastFile_Reopen(gScanFile.pHandle,
                                         pFilename,
                                         pStat);
        }

        /*Did we succeed?*/
        if (! Opened) {
                /*No, unable to open file*/
                fprintf(stderr, "%s: %s: %s\n",
                        Platform_ProgramName(),
                        pFilename,
                        strerror(errno));

                /*Record problem for exit value reporting*/
                Main_ReturnCode(MAIN_RETURN_FAULT);

                /*Skip to next file to process*/
                return FALSE;
        }

        /*Is the filename anything other than stdin?*/
        if (pFilename != NULL) {
                /*Yes, record filename for reporting*/
                gScanFile.Details.pFilename = pFilename;
        } else {
                /*Standard input -- plug in name ourselves*/
                gScanFile.Details.pFilename = "(standard input)";
        }

        /*Is this the second or later file?*/
        if (! FirstFile) {
                /*Yes, handle is already configured, so we're done*/
                return TRUE;
        }

        /*Configure FastFile to reserve space for LF before buffer*/
        if (! FastFile_StartCondition(gScanFile.pHandle,
                                      BYTES_BEFORE_BUFFER,
                                      1,
                                      &gScanFile.PrecedingLF)) {
                /*Error configuring buffer: not enough memory, perhaps?*/
                fprintf(stderr, "%s: Unable to condition start\n",
                        Platform_ProgramName());
                Main_ReturnCode(MAIN_RETURN_FAULT);
                return FALSE;
        }

        /*Configure FastFile to prepare end of buffer*/
        switch (gScanFile.pScanContext->EndCondition) {
        case MATCHENG_CONDITION_TRAILING_LITERAL:
                /*EndBytes holds LF plus PatternLength literals: refuse*/
                /*  a pattern that would overrun the fixed-size array*/
                if (gScanFile.pScanContext->PatternLength >=
                                sizeof(gScanFile.EndBytes)) {
                        fprintf(stderr,
                                "%s: Pattern too long to condition end\n",
                                Platform_ProgramName());
                        Main_ReturnCode(MAIN_RETURN_FAULT);
                        return FALSE;
                }

                /*Add literal to simplify memory search specification*/
                gScanFile.EndBytes[0] = LF;
                memset(&gScanFile.EndBytes[1],
                       gScanFile.pScanContext->TrailingLiteral,
                       gScanFile.pScanContext->PatternLength);
                gScanFile.EndLength =
                        gScanFile.pScanContext->PatternLength + 1;
                break;

        default:
                /*Only a line terminator is needed after the buffer*/
                gScanFile.EndBytes[0] = LF;
                gScanFile.EndLength = 1;
                break;

        }

        if (! FastFile_EndCondition(gScanFile.pHandle,
                                    BYTES_AFTER_BUFFER,
                                    gScanFile.EndLength,
                                    gScanFile.EndBytes)) {
                /*Error configuring buffer: not enough memory, perhaps?*/
                fprintf(stderr, "%s: Unable to condition end\n",
                        Platform_ProgramName());
                Main_ReturnCode(MAIN_RETURN_FAULT);
                return FALSE;
        }

        /*Opened successfully*/
        return TRUE;

} /*Open*/


/************************************************************************/
/*                                                                      */
/*      ExpandNames -- Build a list of all files in a directory         */
/*                                                                      */
/*      Prepares a list of all files in the specified directory,        */
/*      with a NUL terminating each name and consecutive NULs           */
/*      (a zero-length string) marking the end of the list.  No         */
/*      attempt is made to sort names into alphabetical order.          */
/*                                                                      */
/*      Returns the first name in the list, or NULL if the function     */
/*      was unable to build the list successfully.  The list is         */
/*      allocated out of the heap, so if a pointer is returned,         */
/*      the caller must free the memory to avoid memory leaks.
*/
/*                                                                      */
/*      We expect that the caller will want to prepend the              */
/*      directory name, and possibly a trailing slash, to the           */
/*      file, so we add space at the start of the list to allow         */
/*      for this case, and report the start of the prepended area       */
/*      as our return value.  The caller must add                       */
/*      strlen(pDirname) + 1 bytes to the returned pointer to find      */
/*      the first name.  We return this earlier pointer so that         */
/*      the caller can free the memory block correctly.                 */
/*                                                                      */
/*      The directory stream opened here is always closed before        */
/*      returning, whether or not the list could be built.              */
/*                                                                      */
/************************************************************************/
module_scope CHAR *
ScanFile_ExpandNames(CHAR *pDirname)
{
        DIR *pDir;
        struct dirent *pEntry;
        UINT BlockSize;
        UINT Used;
        CHAR *pMem;
        CHAR *pBiggerMem;
        UINT NameSize;

        /*Leave room before first name for directory name plus slash*/
        Used = strlen(pDirname) + 1;

        /*Initial block must hold that prefix and the list terminator*/
        BlockSize = SCANFILE_DIR_NAME_SIZE_DEFAULT;
        while (BlockSize < (Used + 1)) {
                BlockSize *= 2;
        }

        /*Open the directory for enumeration*/
        pDir = opendir(pDirname);
        if (pDir == NULL) {
                /*Failed to access directory*/
                return NULL;
        }

        /*Acquire initial space to store names*/
        pMem = malloc(BlockSize);
        if (pMem == NULL) {
                /*Sorry, unable to acquire space to store names*/
                closedir(pDir);
                return NULL;
        }

        /*Work through each entry in the directory*/
        for (;;) {
                /*Read next entry of the directory, if any*/
                pEntry = readdir(pDir);
                if (pEntry == NULL) {
                        /*Finished enumerating files*/
                        break;
                }

                /*Skip "." and ".." entries if found*/
                if ((strcmp(pEntry->d_name, ".") == 0) ||
                    (strcmp(pEntry->d_name, "..") == 0)) {
                        continue;
                }

                /*Need room for name, its NUL and the list terminator*/
                NameSize = strlen(pEntry->d_name);
                while ((Used + NameSize + 2) > BlockSize) {
                        /*...Allocate a larger block*/
                        BlockSize *= 2;
                        pBiggerMem = realloc(pMem, BlockSize);
                        if (pBiggerMem == NULL) {
                                /*Out of memory: release block AND stream*/
                                free(pMem);
                                closedir(pDir);
                                return NULL;
                        }

                        /*Use newly-acquired space from now on*/
                        pMem = pBiggerMem;
                }

                /*Add the name (including its NUL) to the block*/
                memcpy(pMem + Used, pEntry->d_name, NameSize + 1);
                Used += NameSize + 1;

        }

        /*Terminate the list with a 0-length entry*/
        pMem[Used] = NUL;

        /*Finished with the directory stream: don't leak it*/
        closedir(pDir);

        /*Okay, report results to caller*/
        return pMem;

} /*ExpandNames*/


/************************************************************************/
/*                                                                      */
/*      RecurseDir -- Enumerate and search files in directory           */
/*                                                                      */
/*      Finds the names of all the files in the specified directory,    */
/*      and executes the search on each file found.  This routine       */
/*      is modelled closely on the directory recursion facility         */
/*      in GNU Grep, including checking for circular references         */
/*      in the directory hierarchy.                                     */
/*                                                                      */
/*      Returns FALSE if multi-file searches are to be skipped.
*/
/*                                                                      */
/************************************************************************/
module_scope BOOL
ScanFile_RecurseDir(CHAR *pDirname, ScanFile_Stats *pStats)
{
        ScanFile_Stats *pSearch;
        CHAR *pNames;
        CHAR *pFile;
        UINT SlashSpace;
        UINT DirLen;

        TRACERY(SCANFILE_T_DIR, {
                printf("ScanFile_RecurseDir(%s, ...)\n", pDirname);
        });

        /*Loop through all parent directories of this one*/
        for (pSearch = pStats->pParent;
             pSearch != NULL;
             pSearch = pSearch->pParent) {
                /*Does this predecessor match this directory?*/
                if ((pSearch->stat.st_ino == pStats->stat.st_ino) &&
                    (pSearch->stat.st_dev == pStats->stat.st_dev)) {
                        /*Yes, we've detected a loop: Abandon this */
                        /*  directory but let the search carry on*/
                        return TRUE;
                }
        }

        /*Okay, we haven't encountered a loop*/

        /*Expand the directory into a list of names*/
        pNames = ScanFile_ExpandNames(pDirname);
        if (pNames == NULL) {
                /*Unable to list this directory: say so rather than*/
                /*  silently ending the whole multi-file search*/
                fprintf(stderr, "%s: %s: Unable to list directory\n",
                        Platform_ProgramName(),
                        pDirname);
                Main_ReturnCode(MAIN_RETURN_FAULT);
                return FALSE;
        }

        /*Prepare to prepend name (and optional slash) to each filename*/
        SlashSpace = 0;
        DirLen = strlen(pDirname);
        if (pDirname[DirLen - 1] != '/') {
                SlashSpace = 1;
        }

        /*Loop through each file in the expanded list.  The first name*/
        /*  sits DirLen + 1 bytes into the block; the list ends at a */
        /*  zero-length name*/
        for (pFile = pNames + DirLen + 1;
             *pFile != '\0';
             pFile += strlen(pFile) + 1) {
                /*Prepend the directory name to the filename, writing */
                /*  over the bytes just before it (the reserved prefix*/
                /*  area or a name that has already been searched)*/
                memcpy(pFile - DirLen - SlashSpace, pDirname, DirLen);
                if (SlashSpace == 1) {
                        pFile[-1] = '/';
                }

                /*Okay, search the complete path*/
                if (! ScanFile_Search(pFile - DirLen - SlashSpace,
                                      pStats)) {
                        /*Search isn't interested in any more files*/
                        free(pNames);
                        return FALSE;
                }

        }

        /*Free space acquired for directory names and report success*/
        free(pNames);
        return TRUE;

} /*RecurseDir*/


/************************************************************************/
/*                                                                      */
/*      DisplayBlock -- Display block of lines (for inverted match)     */
/*                                                                      */
/*      Displays a block of lines up to but not including the           */
/*      matching line specified in pDetails.  Also calculates           */
/*      match counts and buffer line counts if these details are        */
/*      to be reported.
*/ /* */ /* The function returns TRUE if the file scan may continue, */ /* and returns FALSE if we're no longer interested in the */ /* remainder of the file. */ /* */ /************************************************************************/ public_scope BOOL ScanFile_DisplayBlock(MatchEng_Details *pDetails, BYTE *pBlockStart, BYTE *pBlockEnd) { BYTE *pNextLF; TRACERY(SCANFILE_T_MATCH, { printf("\nScanfile_DisplayBlock(%p, %p (%d))\n", pBlockStart, pBlockEnd, pBlockEnd - pBlockStart); }); /*Is there any data to report?*/ if (pBlockEnd != pBlockStart) { /*Yes, note that lines were matched*/ gScanFile.MatchedAny = TRUE; } /*Do we need to do any line-by-line analysis or reporting?*/ if (! gScanFile.UnpackBlocks) { /*No, merely dump entire block to the output*/ fwrite(pBlockStart, 1, pBlockEnd - pBlockStart, stdout); /*Finished reporting*/ return TRUE; } /*Loop through each line of the block*/ while (pBlockStart < pBlockEnd) { /*Split block into lines at each LF and report/count lines*/ TRACERY(SCANFILE_T_MATCH, { printf("\nmemchr(%p, LF, %u)", pBlockStart, pBlockEnd - pBlockStart); }); gScanFile.Details.LineMatchCount++; /*Look for next line separator*/ pNextLF = memchr(pBlockStart, LF, pBlockEnd - pBlockStart); /*Did we find a separator?*/ if (pNextLF == NULL) { /*No, block ends without LF*/ pNextLF = pBlockEnd - 1; } /*Do we have a function to report the line?*/ if (gScanFile.pSelect != NULLFUNC) { /*Yes, fill in details and report line*/ gScanFile.Details.pLineStart = pBlockStart; gScanFile.Details.pMatchStart = pBlockStart; gScanFile.Details.pLineEnd = pNextLF + 1; gScanFile.Details.pMatchEnd = pNextLF; if (! 
gScanFile.pSelect(&gScanFile.Details)) { /*We may abandon this file*/ return FALSE; } /*Line number*/ gScanFile.Details.LineNr++; } pBlockStart = pNextLF + 1; } return TRUE; } /*DisplayBlock*/ /************************************************************************/ /* */ /* SearchBuffer -- Search one buffer of file */ /* */ /************************************************************************/ module_scope BOOL ScanFile_SearchBuffer(BYTE *pInBuf) { BYTE *pBufCurr; BOOL Found; UINT32 BufferLines; /*Start scan at first byte of buffer*/ pBufCurr = pInBuf; BufferLines = 0; /*Loop through buffer, looking for RE matches*/ for (;;) { /*Search buffer for matching text*/ Found = gScanFile.pScan(gScanFile.pScanContext, pBufCurr, &gScanFile.Details); BufferLines += gScanFile.Details.BufLineNr; if (! Found) { /*Scan portion of RE not found within buffer*/ TRACERY(SCANFILE_T_SCAN, { printf("Scan not found, buflines: %u\n", gScanFile.Details.BufLineNr); }); BufferLines++; break; } /*Do we need the line start but haven't found it?*/ if ((gScanFile.Details.pLineStart == NULL) && gScanFile.FindLineStart) { /*Yes, find start of matching line*/ gScanFile.Details.pLineStart = ((CHAR *) memrchr(gScanFile.Details.pMatchStart - 1, LF, ~0)) + 1; } TRACERY(SCANFILE_T_SCAN, { CHAR s[42]; Tracery_Decode(&gScanFile.TraceInfo, TRACERY_FLAGS, s, sizeof(s)); printf("\n%s (%s): ", Tracery_Name(&gScanFile.TraceInfo), s); printf("Found: %p, %p(%02x)..%p(%02x) ", gScanFile.Details.pLineStart, gScanFile.Details.pMatchStart, *gScanFile.Details.pMatchStart, gScanFile.Details.pMatchEnd, *gScanFile.Details.pMatchEnd); }); /*Find the end of the line found by the scan*/ gScanFile.Details.pLineEnd = memchr( gScanFile.Details.pMatchEnd, LF, ~0); TRACERY(SCANFILE_T_SCAN, { printf(" End: %p\n", gScanFile.Details.pLineEnd); }); /*Fast scan succeeded: is there a slow match as well?*/ if (gScanFile.pMatch == NULLFUNC) { /*No slow match: search is complete*/ goto Matched; } /*Find the end of the line found 
by the scan*/ gScanFile.pMatchContext->pAfterEndOfBuffer = gScanFile.Details.pLineEnd; Found = gScanFile.pMatch(gScanFile.pMatchContext, gScanFile.Details.pLineStart, &gScanFile.Details2); /*Did we match the harder (starting) bit?*/ if (! Found) { /*No, revert to scanning for easier bit*/ pBufCurr = gScanFile.Details.pLineEnd + 1; BufferLines++; /*Have we hit the end of the buffer?*/ if (pBufCurr >= gScanFile.pScanContext->pAfterEndOfBuffer) { /*Yes, finished this buffer*/ break; } continue; } /*Copy full details of match into main buffer*/ gScanFile.Details.pMatchStart = gScanFile.Details2.pMatchStart; gScanFile.Details.pMatchEnd = gScanFile.Details2.pMatchEnd; Matched: /*Are we reporting a line with normal termination?*/ if (gScanFile.Details.pLineEnd != gScanFile.pScanContext->pAfterEndOfBuffer) { /*Yes, include the terminator in the display*/ if (*gScanFile.Details.pLineEnd++ == CR) { gScanFile.Details.pLineEnd++; } } /*Are we selecting matching lines?*/ if (gScanFile.SelectMatchingLines) { /*Yes, remember that we've found at least one match*/ gScanFile.Details.LineMatchCount++; gScanFile.Details.LineNr += BufferLines; BufferLines = 1; /*Handle selected line*/ if ((gScanFile.pSelect) && (! gScanFile.pSelect(&gScanFile.Details))) { /*Function advises we may skip to next file*/ return FALSE; } } else { /*No, inverted match: are there preceding lines?*/ if (pInBuf != gScanFile.Details.pLineStart) { /*Yes, display them*/ if (! ScanFile_DisplayBlock( &gScanFile.Details, pInBuf, gScanFile.Details.pLineStart)) { /*We may skip to next file*/ return FALSE; } } else { /*No, still count this line*/ gScanFile.Details.LineNr++; } } /*Update search to start of next line*/ pBufCurr = gScanFile.Details.pLineEnd; pInBuf = pBufCurr; /*Have we hit the end of the buffer?*/ if (gScanFile.Details.pLineEnd >= gScanFile.pScanContext->pAfterEndOfBuffer) { /*Yes, finished this buffer*/ break; } } /*Is match sense inverted?*/ if (! 
gScanFile.SelectMatchingLines) { /*Yes, is there any unmatched text at the end of the buffer?*/ TRACERY(SCANFILE_T_SCAN, { printf("InvAtEnd: pInBuf, pAfterEnd: %p %p\n", pInBuf, gScanFile.pScanContext->pAfterEndOfBuffer); }); if (pInBuf < gScanFile.pScanContext->pAfterEndOfBuffer) { /*Yes, display it now*/ if (! ScanFile_DisplayBlock(&gScanFile.Details, pInBuf, gScanFile.pScanContext->pAfterEndOfBuffer)) { /*Display advises we may skip to next file*/ return FALSE; } } } else { /*Add in any remaining lines we counted at the end*/ gScanFile.Details.LineNr += BufferLines; } return TRUE; } /*SearchBuffer*/ /************************************************************************/ /* */ /* Search -- Perform specified search on a file */ /* */ /* This function searches the specified file (or stdin if */ /* pFilename is NULL) using the search options specified */ /* by Configure. The function returns FALSE if the search */ /* has determined that there's no benefit in examining any */ /* more files. */ /* */ /* Parameter pParent is used for recursive searches, so that */ /* circular loops in the directory heirarchy can be detected */ /* and avoided. External callers must specify NULL for this */ /* parameter. */ /* */ /************************************************************************/ public_scope BOOL ScanFile_Search(CHAR *pFilename, void *pParent) { BOOL Matched; UINT32 NrChars; BYTE *pInBuf; ScanFile_Stats Stats; /*Open file for scanning*/ if (! ScanFile_Open(pFilename, &Stats.stat)) { /*Unable to access file: skip to next file, if any*/ return TRUE; } /*Is this file a directory (and we are recursing directories)?*/ if (S_ISDIR(Stats.stat.st_mode)) { /*Yes, have we been asked to recurse directories?*/ if (gScanFile.RecurseDir) { /*Yes, enumerate files in this directory*/ Stats.pParent = pParent; /*?? 
Should close the opened handle?*/ return ScanFile_RecurseDir(pFilename, &Stats); } /*Sorry, we don't grep directories as binary files (yet)*/ /*Sorry, don't report skipped directories, either*/ return TRUE; } /*Initialise line match counter*/ gScanFile.Details.LineMatchCount = 0; /*Set up match details structure for reporting*/ gScanFile.Details.LineNr = 1; ReadFile: /*Get next buffer of file, if any*/ if (! FastFile_Read(gScanFile.pHandle, &pInBuf, &NrChars, &gScanFile.Details.BufferOffset)) { /*Error while reading buffer*/ printf("?? ScanFile_Search: FastFile read error\n"); return FALSE; } /*Did we read any characters?*/ if (NrChars != 0) { /*Yes, search the buffer we've received*/ TRACERY(SCANFILE_T_BUFFER, { UINT i; printf("ScanFile: Buffer %p..%p, %lu chars:", pInBuf, &pInBuf[NrChars], NrChars); for (i = 0; i < 6; i++) { printf(" %02x", pInBuf[i]); } printf("...\n"); }); gScanFile.Details.pBufferStart = pInBuf; /*Set up buffer end ptr (byte search and/or inverted match)*/ gScanFile.pScanContext->pAfterEndOfBuffer = &pInBuf[NrChars]; /*Destroy line end pointer in case there's no match*/ gScanFile.Details.pLineEnd = NULL; /*Is the buffer bigger than the backtracking size?*/ if (NrChars > gScanFile.pScanContext->BacktrackSize) { /*Yes, get RETable to allocate a suitable space*/ if (! RETable_AllocBacktrack(gScanFile.pScanContext, NrChars)) { /*Sorry, ran out of resources*/ fprintf(stderr, "%s: Not enough backtrack memory\n", Platform_ProgramName()); return TRUE; } } /*Search this buffer*/ if (ScanFile_SearchBuffer(pInBuf)) { /*Handle next buffer of file (if any)*/ goto ReadFile; } /*If we reach here, search isn't interested in file any more*/ } Matched = gScanFile.Details.LineMatchCount != 0; /*Were we asked to report if no lines matched within file?*/ if ((gScanFile.Details.ReportingOptions & MATCHENG_RPT_NONMATCH_FILES) && ! 
Matched) { /*Yes, report filename now and proceed to next file*/ gScanFile.pFilenameOut(&gScanFile.Details); gScanFile.MatchedAny = TRUE; return TRUE; } /*Accumulate status of match across all files*/ if (Matched) { gScanFile.MatchedAny = TRUE; } /*Were we asked to count lines?*/ if (gScanFile.Details.ReportingOptions & MATCHENG_RPT_LINECOUNT) { /*Yes, report count now (include filename if selected)*/ gScanFile.pFilenameOut(&gScanFile.Details); } /*Request that enumeration continue*/ return TRUE; } /*Search*/ /************************************************************************/ /* */ /* Pattern -- Specify RE to be searched */ /* */ /* pPattern is the "compiled" version created by RegExp. */ /* ScanOptions allows modifications to the pattern such as */ /* case insensitivity, word match and inverted match sense */ /* to be specified. */ /* */ /* Pattern expands the RE into a version optimised for speed, */ /* and return FALSE if it is unable to handle the RE */ /* (for example, if it runs out of RAM). 
*/ /* */ /************************************************************************/ public_scope BOOL ScanFile_Pattern(RegExp_Specification *pPattern, LWORD ScanOptions) { RegExp_Specification *pEasyBit = NULL; RegExp_Specification *pHarderBit = NULL; BOOL IgnoreCase; STBM_SearchSpec *pSTBM; LWORD ScanFlags = MATCHENG_SPEC_SKIP_BYTES | MATCHENG_SPEC_ENDMARKER(SCANFILE_ENDMARKER_DEFAULT); LWORD MatchFlags = 0; UINT PatternLength; BYTE TrailingLiteral; /*Does the main search reference a valid RE specification?*/ if (pPattern == NULL) { /*No, fault in configuration specification*/ return FALSE; } /*Allowable optimisations given in ScanOptions*/ if (ScanOptions & SCANFILE_DEBUG_COMPILED) { RegExp_ShowCodes("RE.Original: ", pPattern); } /*Does the client want CR/LF line termination as well as LF?*/ if (ScanOptions & SCANFILE_OPT_CR_IS_TERMINATOR) { /*Yes, tell match engine to set up to support this*/ MatchFlags |= MATCHENG_SPEC_CR_IS_TERMINATOR; ScanFlags |= MATCHENG_SPEC_CR_IS_TERMINATOR; } /*Does the client want to recurse directories?*/ gScanFile.RecurseDir = FALSE; if (ScanOptions & SCANFILE_OPT_RECURSE_DIRECTORIES) { /*Yes, remember this for when we're dealing with files*/ gScanFile.RecurseDir = TRUE; } /*Default to no "easy" search before match search*/ gScanFile.pMatchContext = NULL; /*Does the client want us to count matching lines?*/ if (ScanOptions & SCANFILE_NUMBER_MATCHING_LINES) { /*Yes, can't use search optimisations that skip bytes*/ ScanOptions &= ~SCANFILE_OPT_SKIP; /*Tell RE engine to include line counting and disallow skip*/ ScanFlags &= ~MATCHENG_SPEC_SKIP_BYTES; ScanFlags |= MATCHENG_SPEC_COUNT_LINES; } /*Does the client want us to count nonmatching lines?*/ if (ScanOptions & SCANFILE_NUMBER_NONMATCH_LINES) { /*Need to break blocks of text into lines*/ gScanFile.UnpackBlocks = TRUE; } /*Is the client happy with only an approximate match?*/ if (ScanOptions & SCANFILE_OPT_APPROXIMATE) { /*Yes, remove optional first/last elements that slow us down*/ if 
(RegExp_SlashEnds(pPattern)) { /*RE has been modified to simplify things*/ if (ScanOptions & SCANFILE_DEBUG_COMPILED) { RegExp_ShowCodes("RE.Approx: ", pPattern); } } } /*Default to no match function*/ gScanFile.pMatch = NULLFUNC; /*Can the entire RE be searched using STBM?*/ pSTBM = NULL; if ((ScanOptions & SCANFILE_OPT_SKIP) && (ScanOptions & SCANFILE_OPT_SELF_TUNED_BM)) { pSTBM = STBMShim_Pattern(pPattern, &PatternLength, &IgnoreCase, &TrailingLiteral); } if (pSTBM != NULL) { /*Yes, does the caller want to display the tables?*/ if (! (ScanOptions & SCANFILE_DEBUG_DISPLAY)) { /*No, can skip a lot of unnecessary setup code*/ goto AfterTableAnalysis; } } /*Is there an easier bit to search in the middle of the RE?*/ gScanFile.pScan = gScanFile.pExternMatchFunc; if ((ScanOptions & SCANFILE_OPT_EASIEST_FIRST) && RegExp_EasiestFirst(pPattern, &pEasyBit)) { /*Yes, modify search to scan for that part first*/ pHarderBit = pPattern; pPattern = pEasyBit; gScanFile.FindLineStart = TRUE; if (ScanOptions & SCANFILE_DEBUG_COMPILED) { RegExp_ShowCodes("RE.Easy: ", pPattern); } /*Use function to check for full match after easy bit found*/ gScanFile.pMatch = gScanFile.pExternMatchFunc; /*Expand hard bit to line-based search*/ MatchFlags |= MATCHENG_SPEC_ENDMARKER(LF); if (! 
RETable_Expand(pHarderBit, MatchFlags, &gScanFile.pMatchContext)) { /*Expansion failed for some reason*/ printf("RETable.Expand (match) failed\n"); return FALSE; } } /*Are we allowed to attempt optimisations that skip bytes?*/ if (ScanOptions & SCANFILE_OPT_SKIP) { /*Yes, may we try to use self-tuning Boyer-Moore algorithm?*/ if (ScanOptions & SCANFILE_OPT_SELF_TUNED_BM) { /*Yes, see if the algorithm can handle the search*/ pSTBM = STBMShim_Pattern(pPattern, &PatternLength, &IgnoreCase, &TrailingLiteral); } } /*FALLTHROUGH*/ AfterTableAnalysis: /*If no STBM or if table display, expand RE into table-driven format*/ if ((pSTBM == NULL) || (ScanOptions & SCANFILE_DEBUG_DISPLAY)) { /*Expand compact RE spec into table-driven version*/ if (! RETable_Expand(pPattern, ScanFlags, &gScanFile.pScanContext)) { /*Expansion failed for some reason*/ fprintf(stderr, "RETable.Expand (scan) failed\n"); return FALSE; } } else { /*Using STBM, allocate scan context*/ if (! ScanFile_NewScanContext(&gScanFile.pScanContext)) { /*Sorry, unable to set up scan context*/ fprintf(stderr, "STBM scan context error\n"); return FALSE; } } /*Are we using STBM?*/ if (pSTBM != NULL) { /*Yes, set up scan context*/ gScanFile.pScanContext->PatternLength = PatternLength; gScanFile.pScanContext->TrailingLiteral = TrailingLiteral; gScanFile.pScanContext->EndCondition = MATCHENG_CONDITION_TRAILING_LITERAL; /*Configure search to use STBM interface*/ if (IgnoreCase) { gScanFile.pScan = STBMShim_SearchInCase; } else { /*Select STBM or TBM as appropriate*/ if (ScanOptions & SCANFILE_OPT_TUNED_BM) { /*Caller wants Tuned BM for comparison*/ gScanFile.pScan = STBMShim_SearchTBM; } else { /*Use behoffski's self-tuned BM*/ gScanFile.pScan = STBMShim_Search; } } gScanFile.pScanContext->pSpare1 = pSTBM; TRACERY(SCANFILE_T_SCAN, { printf("\nScanfile: Using STBM"); }); } /*Display tables if requested*/ if (ScanOptions & SCANFILE_DEBUG_DISPLAY) { TblDisp_Describe(gScanFile.pScanContext, "Scan"); /*Does the RE have a 
match component as well?*/ if (gScanFile.pMatchContext != NULL) { /*Yes, display it*/ TblDisp_Describe(gScanFile.pMatchContext, "Match"); } } /*Report success to caller*/ return TRUE; } /*Pattern*/ /************************************************************************/ /* */ /* Configure -- Define how the module searches and reports matches */ /* */ /************************************************************************/ public_scope BOOL ScanFile_Configure(LWORD ReportingOptions) { TRACERY(SCANFILE_T_MATCH, { printf("Scanfile_Configure(%08lx)\n", ReportingOptions); }); /*Record reporting options and select functions accordingly*/ gScanFile.Details.ReportingOptions = ReportingOptions; /*Default to normal reporting of lines*/ gScanFile.pSelect = gScanFile.pNormalOut; gScanFile.FindLineStart = TRUE; /*Does the client want to see lines?*/ if (! (ReportingOptions & MATCHENG_RPT_LINE)) { /*No, just show filename on match and finish file*/ gScanFile.pSelect = gScanFile.pFilenameOut; gScanFile.FindLineStart = FALSE; } /*Does the client want line counting?*/ if (ReportingOptions & MATCHENG_RPT_LINECOUNT) { /*Yes, use match count function*/ gScanFile.pSelect = NULLFUNC; } /*Does the client want highlighted matches?*/ if (ReportingOptions & MATCHENG_RPT_HIGHLIGHT) { /*Yes, use platform-specific function*/ gScanFile.pSelect = gScanFile.pHighlightOut; } /*Has the client requested that matches be reported with delimiters?*/ if (ReportingOptions & MATCHENG_RPT_MARKER_FLAG) { /*Yes, use platform-specific function and unpack char*/ gScanFile.pSelect = gScanFile.pHighlightOut; gScanFile.Details.MarkerChar = MATCHENG_RPT_MARKER_UNPACK(ReportingOptions); } /*Does the client want to see nonmatch files?*/ if (ReportingOptions & MATCHENG_RPT_NONMATCH_FILES) { /*Yes, abandon file as soon as match found*/ gScanFile.pSelect = ScanFile_MatchedAbandon; gScanFile.FindLineStart = FALSE; } /*Does the client want to report non-matching lines?*/ gScanFile.SelectMatchingLines = TRUE; if 
(ReportingOptions & MATCHENG_RPT_INVERT_MATCH_SENSE) { /*Yes, select inverted match sense*/ gScanFile.SelectMatchingLines = FALSE; gScanFile.FindLineStart = TRUE; } /*Does the client want to add information to each line?*/ if (ReportingOptions & (MATCHENG_RPT_LINENUMBER | MATCHENG_RPT_BYTEOFFSET | MATCHENG_RPT_FILENAME | MATCHENG_RPT_LINECOUNT | MATCHENG_RPT_REMOVE_TRAILING_CR)) { /*Yes, remember to break apart blocks if inverted sense*/ gScanFile.UnpackBlocks = TRUE; gScanFile.FindLineStart = TRUE; } /*Configured module successfully*/ return TRUE; } /*Configure*/ /************************************************************************/ /* */ /* MatchedAny -- Report if any files matched search criteria */ /* */ /************************************************************************/ public_scope BOOL ScanFile_MatchedAny(void) { /*Was internal flag set?*/ if (gScanFile.MatchedAny) { /*Yes, clear it and then report state to caller*/ gScanFile.MatchedAny = FALSE; return TRUE; } /*Internal flag was not set*/ return FALSE; } /*MatchedAny*/ /************************************************************************/ /* */ /* MatchFunction -- Define routine to perform match */ /* */ /* ScanFile wishes to provide extreme high-performance searches */ /* to do so in a very portable fashion. This function is the */ /* result: ScanFile receives the address of the function that */ /* implements the match from an outsider (usually Platform). */ /* This function must be called before Pattern. 
*/
/*                                                                      */
/************************************************************************/
public_scope void
ScanFile_MatchFunction(MatchEng_MatchFunction pMatchFunc)
{
        /*Remember function for operation*/
        gScanFile.pExternMatchFunc = pMatchFunc;

} /*MatchFunction*/


/************************************************************************/
/*                                                                      */
/*      NoMatchFunction -- Place-holder to warn of incorrect config     */
/*                                                                      */
/*      This function is called if ScanFile does not receive a          */
/*      match function appropriate to the platform.  It ignores its     */
/*      arguments, writes a diagnostic to stderr and returns FALSE.     */
/*                                                                      */
/************************************************************************/
module_scope BOOL
ScanFile_NoMatchFunction(MatchEng_Spec *pTable,
                         BYTE *pText,
                         MatchEng_Details *pDetails)
{
        /*Parameters exist only to fit the match function signature*/
        (void) pTable;
        (void) pText;
        (void) pDetails;

        /*Terminate the line so repeated warnings don't run together*/
        fprintf(stderr, "Scanfile: No match function provided!\n");
        return FALSE;

} /*NoMatchFunction*/


/************************************************************************/
/*                                                                      */
/*      OutputFunctions -- Specify functions to perform match output    */
/*                                                                      */
/*      In order to keep ScanFile as portable as possible, the          */
/*      match and filename display functions are provided by            */
/*      an external party, since generation and display of              */
/*      output (and especially highlighting) is platform-specific.      */
/*                                                                      */
/*      This function must be called after Init but before any          */
/*      RE specification or module configuration.                       
*/ /* */ /************************************************************************/ public_scope void ScanFile_OutputFunctions(MatchEng_SelectFunction *pNormal, MatchEng_SelectFunction *pHighlight, MatchEng_SelectFunction *pFilenameOut) { /*Record the functions to use for later*/ gScanFile.pNormalOut = pNormal; gScanFile.pHighlightOut = pHighlight; gScanFile.pFilenameOut = pFilenameOut; } /*OutputFunctions*/ #ifdef TRACERY_ENABLED /************************************************************************/ /* */ /* TraceryLink -- Tell Tracery how to deal with us */ /* */ /* This procedure is used by Tracery to find out how to */ /* manipulate the trace flags for this module and/or object. */ /* The platform should be able to hand this routine to */ /* Tracery when setting up the system without needing to */ /* know too many details about how the traces are to be */ /* set up. */ /* */ /* This function may be used to get the flags for the */ /* module, or for any object created by the module. */ /* If the pObject parameter is NULL, the module information */ /* is returned; otherwise, the object's info is returned. */ /* Currently we report our flag register, our preferred */ /* set of default flags, and a list of edit specifiers and */ /* bits to edit in the flag register. In the future this */ /* may change: Tracery is still rather tentative. */ /* */ /************************************************************************/ public_scope BOOL ScanFile_TraceryLink(void *pObject, UINT Opcode, ...) 
{ Tracery_ObjectInfo **ppInfoBlock; LWORD *pDefaultFlags; Tracery_EditEntry **ppEditList; va_list ap; va_start(ap, Opcode); switch (Opcode) { case TRACERY_REGCMD_GET_INFO_BLOCK: /*Report module's block (we don't support objects as yet)*/ ppInfoBlock = va_arg(ap, Tracery_ObjectInfo **); *ppInfoBlock = &gScanFile.TraceInfo; break; case TRACERY_REGCMD_GET_DEFAULT_FLAGS: pDefaultFlags = va_arg(ap, LWORD *); *pDefaultFlags = SCANFILE_T_BUFFER | SCANFILE_T_SCAN | SCANFILE_T_MATCH | SCANFILE_T_DIR; break; case TRACERY_REGCMD_GET_EDIT_LIST: ppEditList = va_arg(ap, Tracery_EditEntry **); *ppEditList = gScanFile_TraceryEditDefs; break; default: /*Unsupported opcode*/ va_end(ap); return FALSE; } va_end(ap); return TRUE; } /*TraceryLink*/ #endif /*TRACERY_ENABLED*/ /************************************************************************/ /* */ /* Init -- Prepare module for operation */ /* */ /************************************************************************/ public_scope void ScanFile_Init(void) { /*No traces enabled by default*/ TRACERY_CLEAR_ALL_FLAGS(&TRACERY_MODULE_INFO); /*Initialise selection reporting details*/ gScanFile.Details.LineMatchCount = 0; /*No match function provided initially*/ gScanFile.pMatch = ScanFile_NoMatchFunction; gScanFile.pScanContext = NIL; /*Configure default scanning options (display matching lines)*/ gScanFile.SelectMatchingLines = TRUE; /*By default, we don't recurse directories*/ gScanFile.RecurseDir = FALSE; /*Assume that we don't need to find the line start*/ gScanFile.FindLineStart = FALSE; /*No platform-specific display functions provided yet*/ gScanFile.pNormalOut = NULLFUNC; gScanFile.pHighlightOut = NULLFUNC; gScanFile.pFilenameOut = NULLFUNC; gScanFile.pSelect = NULLFUNC; /*Initialise FastFile configuration memory constant*/ gScanFile.PrecedingLF = LF; /*Don't display any debug information*/ gScanFile.Debug = 0; /*Default to treating blocks of file with minimal overhead*/ gScanFile.UnpackBlocks = FALSE; } /*Init*/