00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026 #include <cctype>
00027 #include <cstdarg>
00028 #include <cstdlib>
00029 #include <cstring>
00030 #include <mpcl/text/string.hh>
00031 #include <mpcl/text/regex/matcher.hh>
00032
00033
00034
00035
00036
00037
namespace mpcl
{

  namespace text
  {

    namespace regex
    {

      //
      //  Pattern tokens recognised by the matcher, each one paired with its
      //  length.  The lengths are computed once, when this translation unit
      //  is initialised, so the scanner does not have to call strlen() for
      //  every comparison.
      //

      //  Whitespace separators: exactly one, one or more, zero or more.
      static const char*   _pkcSeparatorPattern             = "[[:space:]]";
      static std::size_t   _zSeparatorPatternLength         = std::strlen (_pkcSeparatorPattern);
      static const char*   _pkcSeparatorPlusPattern         = "[[:space:]]+";
      static std::size_t   _zSeparatorPlusPatternLength     = std::strlen (_pkcSeparatorPlusPattern);
      static const char*   _pkcSeparatorAsteriskPattern     = "[[:space:]]*";
      static std::size_t   _zSeparatorAsteriskPatternLength = std::strlen (_pkcSeparatorAsteriskPattern);

      //  Blanks: exactly one, one or more, zero or more.
      static const char*   _pkcBlankPattern                 = "[[:blank:]]";
      static std::size_t   _zBlankPatternLength             = std::strlen (_pkcBlankPattern);
      static const char*   _pkcBlankPlusPattern             = "[[:blank:]]+";
      static std::size_t   _zBlankPlusPatternLength         = std::strlen (_pkcBlankPlusPattern);
      static const char*   _pkcBlankAsteriskPattern         = "[[:blank:]]*";
      static std::size_t   _zBlankAsteriskPatternLength     = std::strlen (_pkcBlankAsteriskPattern);

      //  Any run of characters.
      static const char*   _pkcAnyAsteriskPattern           = "[.]*";
      static std::size_t   _zAnyAsteriskPatternLength       = std::strlen (_pkcAnyAsteriskPattern);

      //  Value placeholders.
      static const char*   _pkcStringPattern                = "%s";
      static std::size_t   _zStringPatternLength            = std::strlen (_pkcStringPattern);
      static const char*   _pkcIntegerPattern               = "%d";
      static std::size_t   _zIntegerPatternLength           = std::strlen (_pkcIntegerPattern);
      static const char*   _pkcQuotableStringPattern        = "%q";
      static std::size_t   _zQuotableStringPatternLength    = std::strlen (_pkcQuotableStringPattern);
      static const char*   _pkcTextPattern                  = "%t";
      static std::size_t   _zTextPatternLength              = std::strlen (_pkcTextPattern);

      //  End-of-input marker.
      static const char*   _pkcEofPattern                   = "<<EOF>>";
      static std::size_t   _zEofPatternLength               = std::strlen (_pkcEofPattern);

    }  // namespace regex

  }  // namespace text

}  // namespace mpcl
00112
00113
00114
00115
00116
00117
00118 void mpcl::text::regex::TMatcher::
00119 clearDefinitions (void)
00120 {
00121
00122 tTerminalMap.clear();
00123
00124 }
00125
00126
00127 void mpcl::text::regex::TMatcher::
00128 clearStream (void)
00129 {
00130
00131 if ( gLocalStream )
00132 {
00133 if ( !ptSourceIstream )
00134 {
00135 throw TNoStreamException ("bad stream", __FILE__, __LINE__);
00136 }
00137 else
00138 {
00139 delete ptSourceIstream;
00140 }
00141 ptSourceIstream = NULL;
00142
00143 gLocalStream = false;
00144 }
00145
00146 }
00147
00148
00149 void mpcl::text::regex::TMatcher::
00150 define ( const char* pkcTERMINAL ,
00151 const char* pkcDEFINITION )
00152 {
00153
00154 tTerminalMap.bind (pkcTERMINAL, pkcDEFINITION);
00155
00156 }
00157
00158
00159 bool mpcl::text::regex::TMatcher::
00160 match (const char* pkcPATTERN_STRING) const
00161 {
00162
00163 bool gSuccess = false;
00164
00165 if ( !ptSourceIstream )
00166 {
00167 throw TNoStreamException ("bad stream", __FILE__, __LINE__);
00168 }
00169 else
00170 {
00171 std::ios::iostate tInitialIostate = ptSourceIstream->rdstate();
00172 std::streampos tInitialStreampos = ptSourceIstream->tellg();
00173
00174 gSuccess = ( scan (pkcPATTERN_STRING, NULL) > 0 );
00175 if ( gSuccess )
00176 {
00177
00178
00179
00180
00181 ptSourceIstream->seekg (tInitialStreampos);
00182 ptSourceIstream->clear (tInitialIostate);
00183 }
00184 }
00185 return gSuccess;
00186
00187 }
00188
00189
00190 void mpcl::text::regex::TMatcher::
00191 redefine ( const char* pkcTERMINAL ,
00192 const char* pkcDEFINITION )
00193 {
00194
00195 tTerminalMap [pkcTERMINAL] = pkcDEFINITION;
00196
00197 }
00198
00199
00200 std::size_t mpcl::text::regex::TMatcher::
00201 scan (const char* pkcPATTERN_STRING...) const
00202 {
00203
00204 using std::size_t;
00205 using std::streampos;
00206 using std::string;
00207
00208 size_t zTotalMatchedChars = 0;
00209
00210 if ( !ptSourceIstream )
00211 {
00212 throw TNoStreamException ("bad stream", __FILE__, __LINE__);
00213 }
00214 else
00215 {
00216
00217
00218
00219
00220
00221
00222
00223 va_list tVa_list;
00224 int iNextCharacter;
00225 bool gMoreArguments = true;
00226 TString yPatternInstance = instantiate (pkcPATTERN_STRING);
00227 char* pcPatternIterator = (char*) yPatternInstance.c_str();
00228 std::ios::iostate tInitialIostate = ptSourceIstream->rdstate();
00229 std::streampos tInitialStreampos = ptSourceIstream->tellg();
00230 string* pyArgument = NULL;
00231 int* piArgument = NULL;
00232
00233 va_start (tVa_list, pkcPATTERN_STRING);
00234 while ( *pcPatternIterator && ptSourceIstream->good() )
00235 {
00236 iNextCharacter = ptSourceIstream->peek();
00237 if ( iNextCharacter == EOF )
00238 {
00239 break;
00240 }
00241
00242
00243
00244
00245 if ( !strncmp ( _pkcSeparatorPlusPattern ,
00246 pcPatternIterator ,
00247 _zSeparatorPlusPatternLength ) )
00248 {
00249 if ( !isspace (iNextCharacter) )
00250 {
00251 break;
00252 }
00253 else
00254 {
00255 ptSourceIstream->get();
00256 ++zTotalMatchedChars;
00257 while ( isspace (ptSourceIstream->peek()) )
00258 {
00259 ptSourceIstream->get();
00260 ++zTotalMatchedChars;
00261 }
00262 pcPatternIterator += _zSeparatorPlusPatternLength;
00263 continue;
00264 }
00265 }
00266
00267
00268
00269 if ( !strncmp ( _pkcSeparatorAsteriskPattern ,
00270 pcPatternIterator ,
00271 _zSeparatorAsteriskPatternLength ) )
00272 {
00273 while ( isspace (ptSourceIstream->peek()) )
00274 {
00275 ptSourceIstream->get();
00276 ++zTotalMatchedChars;
00277 }
00278 pcPatternIterator += _zSeparatorAsteriskPatternLength;
00279 continue;
00280 }
00281
00282
00283
00284 if ( !strncmp ( _pkcSeparatorPattern ,
00285 pcPatternIterator ,
00286 _zSeparatorPatternLength ) )
00287 {
00288 if ( !isspace (ptSourceIstream->peek()) )
00289 {
00290 break;
00291 }
00292 else
00293 {
00294 ptSourceIstream->get();
00295 ++zTotalMatchedChars;
00296 pcPatternIterator += _zSeparatorPatternLength;
00297 continue;
00298 }
00299 }
00300
00301
00302
00303 if ( !strncmp (_pkcBlankPlusPattern, pcPatternIterator, _zBlankPlusPatternLength) )
00304 {
00305 if ( ( iNextCharacter != ' ' ) && ( iNextCharacter != '\t' ) )
00306 {
00307 break;
00308 }
00309 else
00310 {
00311 ptSourceIstream->get();
00312 ++zTotalMatchedChars;
00313 iNextCharacter = ptSourceIstream->peek();
00314 while ( ( iNextCharacter == ' ' ) || ( iNextCharacter == '\t') )
00315 {
00316 ptSourceIstream->get();
00317 iNextCharacter = ptSourceIstream->peek();
00318 ++zTotalMatchedChars;
00319 }
00320 pcPatternIterator += _zBlankPlusPatternLength;
00321 continue;
00322 }
00323 }
00324
00325
00326
00327 if ( !strncmp ( _pkcBlankAsteriskPattern ,
00328 pcPatternIterator ,
00329 _zBlankAsteriskPatternLength ) )
00330 {
00331 while ( ( iNextCharacter == ' ' ) || ( iNextCharacter == '\t') )
00332 {
00333 ptSourceIstream->get();
00334 iNextCharacter = ptSourceIstream->peek();
00335 ++zTotalMatchedChars;
00336 }
00337 pcPatternIterator += _zBlankAsteriskPatternLength;
00338 continue;
00339 }
00340
00341
00342
00343 if ( !strncmp (_pkcAnyAsteriskPattern, pcPatternIterator, _zAnyAsteriskPatternLength) )
00344 {
00345 const char* pkcNextPattern = pcPatternIterator + _zAnyAsteriskPatternLength;
00346
00347 if ( *pkcNextPattern )
00348 {
00349 while ( ( iNextCharacter != EOF ) && !match (pkcNextPattern) )
00350 {
00351 ptSourceIstream->get();
00352 iNextCharacter = ptSourceIstream->peek();
00353 ++zTotalMatchedChars;
00354 }
00355 }
00356 else
00357 {
00358
00359
00360
00361
00362 while ( iNextCharacter != EOF )
00363 {
00364 ptSourceIstream->get();
00365 iNextCharacter = ptSourceIstream->peek();
00366 ++zTotalMatchedChars;
00367 }
00368 }
00369 pcPatternIterator = (char*) pkcNextPattern;
00370 continue;
00371 }
00372
00373
00374
00375 if ( !strncmp (_pkcBlankPattern, pcPatternIterator, _zBlankPatternLength) )
00376 {
00377 if ( ( iNextCharacter != ' ' ) && ( iNextCharacter != '\t' ) )
00378 {
00379 break;
00380 }
00381 else
00382 {
00383 ptSourceIstream->get();
00384 ++zTotalMatchedChars;
00385 pcPatternIterator += _zBlankPatternLength;
00386 continue;
00387 }
00388 }
00389
00390
00391
00392 if ( !strncmp (_pkcEofPattern, pcPatternIterator, _zEofPatternLength) )
00393 {
00394
00395
00396
00397 break;
00398 }
00399
00400
00401
00402 if ( !strncmp (_pkcStringPattern, pcPatternIterator, _zStringPatternLength) )
00403 {
00404 string yMatched;
00405
00406 while ( ( isalnum (iNextCharacter) ) ||
00407 ( iNextCharacter == '_' ) ||
00408 ( iNextCharacter == '-' ) ||
00409 ( iNextCharacter == '.' ) )
00410 {
00411 yMatched += ptSourceIstream->get();
00412 iNextCharacter = ptSourceIstream->peek();
00413 ++zTotalMatchedChars;
00414 }
00415 pcPatternIterator += _zStringPatternLength;
00416 if ( gMoreArguments )
00417 {
00418 pyArgument = va_arg (tVa_list, string*);
00419 gMoreArguments = ( pyArgument != NULL );
00420 if ( gMoreArguments )
00421 {
00422 *pyArgument = yMatched;
00423 }
00424 }
00425 continue;
00426 }
00427
00428
00429
00430 if ( !strncmp (_pkcIntegerPattern, pcPatternIterator, _zIntegerPatternLength) )
00431 {
00432 TString yMatched;
00433
00434 while ( isdigit (iNextCharacter) )
00435 {
00436 yMatched += ptSourceIstream->get();
00437 iNextCharacter = ptSourceIstream->peek();
00438 ++zTotalMatchedChars;
00439 }
00440 pcPatternIterator += _zStringPatternLength;
00441 if ( gMoreArguments )
00442 {
00443 piArgument = va_arg (tVa_list, int*);
00444 gMoreArguments = ( piArgument != NULL );
00445 if ( gMoreArguments )
00446 {
00447 *piArgument = atoi (yMatched.c_str());
00448 }
00449 }
00450 continue;
00451 }
00452
00453
00454
00455 if ( !strncmp ( _pkcQuotableStringPattern ,
00456 pcPatternIterator ,
00457 _zQuotableStringPatternLength ) )
00458 {
00459 TString yMatched;
00460
00461 iNextCharacter = ptSourceIstream->peek();
00462 if ( iNextCharacter == EOF )
00463 {
00464 continue;
00465 }
00466 else
00467 {
00468 if ( iNextCharacter != '"' )
00469 {
00470
00471
00472
00473 while ( ( isalnum (iNextCharacter) ) ||
00474 ( iNextCharacter == '_' ) ||
00475 ( iNextCharacter == '-' ) ||
00476 ( iNextCharacter == '.' ) )
00477 {
00478 yMatched += ptSourceIstream->get();
00479 iNextCharacter = ptSourceIstream->peek();
00480 ++zTotalMatchedChars;
00481 }
00482 }
00483 else
00484 {
00485
00486
00487
00488 ptSourceIstream->get();
00489 ++zTotalMatchedChars;
00490 iNextCharacter = ptSourceIstream->peek();
00491 while ( ( iNextCharacter != '"' ) && ( iNextCharacter != EOF ) )
00492 {
00493 yMatched += ptSourceIstream->get();
00494 iNextCharacter = ptSourceIstream->peek();
00495 ++zTotalMatchedChars;
00496 }
00497 if ( iNextCharacter != '"' )
00498 {
00499 break;
00500 }
00501 else
00502 {
00503 ptSourceIstream->get();
00504 ++zTotalMatchedChars;
00505 }
00506 }
00507 pcPatternIterator += _zQuotableStringPatternLength;
00508 if ( gMoreArguments )
00509 {
00510 pyArgument = va_arg (tVa_list, string*);
00511 gMoreArguments = ( pyArgument != NULL );
00512 if ( gMoreArguments )
00513 {
00514 *pyArgument = yMatched;
00515 }
00516 }
00517 continue;
00518 }
00519 }
00520
00521
00522
00523 if ( !strncmp (_pkcTextPattern, pcPatternIterator, _zTextPatternLength) )
00524 {
00525 TString yMatched;
00526 const char* pkcNextPattern = pcPatternIterator + _zTextPatternLength;
00527
00528 if ( *pkcNextPattern )
00529 {
00530 while ( ( iNextCharacter != EOF ) && !match (pkcNextPattern) )
00531 {
00532 yMatched += ptSourceIstream->get();
00533 iNextCharacter = ptSourceIstream->peek();
00534 ++zTotalMatchedChars;
00535 }
00536 }
00537 else
00538 {
00539
00540
00541
00542
00543 while ( iNextCharacter != EOF )
00544 {
00545 yMatched += ptSourceIstream->get();
00546 iNextCharacter = ptSourceIstream->peek();
00547 ++zTotalMatchedChars;
00548 }
00549 }
00550 pcPatternIterator = (char*) pkcNextPattern;
00551 if ( gMoreArguments )
00552 {
00553 pyArgument = va_arg (tVa_list, string*);
00554 gMoreArguments = ( pyArgument != NULL );
00555 if ( gMoreArguments )
00556 {
00557 *pyArgument = yMatched;
00558 }
00559 }
00560 continue;
00561 }
00562
00563
00564
00565
00566 if ( iNextCharacter != EOF )
00567 {
00568 if ( matchChars (iNextCharacter, pcPatternIterator) )
00569 {
00570 ++zTotalMatchedChars;
00571 ptSourceIstream->get();
00572 }
00573 else
00574 {
00575 if ( *(pcPatternIterator + 1) != '?' )
00576 {
00577 break;
00578 }
00579 }
00580 if ( *(pcPatternIterator + 1) == '?' )
00581 {
00582 ++pcPatternIterator;
00583 }
00584 ++pcPatternIterator;
00585 }
00586 }
00587 if ( *pcPatternIterator )
00588 {
00589
00590
00591
00592 if ( !strcmp (_pkcEofPattern, pcPatternIterator) )
00593 {
00594 if ( iNextCharacter != EOF )
00595 {
00596 zTotalMatchedChars = 0;
00597 ptSourceIstream->seekg (tInitialStreampos);
00598 ptSourceIstream->clear (tInitialIostate);
00599 }
00600 }
00601 else
00602 {
00603 zTotalMatchedChars = 0;
00604 ptSourceIstream->seekg (tInitialStreampos);
00605 ptSourceIstream->clear (tInitialIostate);
00606 }
00607 }
00608 va_end (tVa_list);
00609 }
00610 return zTotalMatchedChars;
00611
00612 }
00613
00614
00615 void mpcl::text::regex::TMatcher::
00616 setCaseSensitiveness (bool gTRUTH)
00617 {
00618
00619 gCaseSensitive = gTRUTH;
00620
00621 }
00622
00623
00624 void mpcl::text::regex::TMatcher::
00625 setInput (const char* pkcSTRING)
00626 {
00627
00628 clearStream();
00629 ptSourceIstream = new std::basic_istringstream<char> (pkcSTRING);
00630 gLocalStream = true;
00631 if ( pkcSTRING && std::strlen (pkcSTRING) )
00632 {
00633 checkStream();
00634 }
00635
00636 }
00637
00638
00639 void mpcl::text::regex::TMatcher::
00640 setInput (std::basic_istream<char>& rtSOURCE_ISTREAM)
00641 {
00642
00643 clearStream();
00644 ptSourceIstream = &rtSOURCE_ISTREAM;
00645 checkStream();
00646
00647 }
00648
00649
00650
00651
00652
00653
00654 void mpcl::text::regex::TMatcher::
00655 checkStream (void) const
00656 {
00657
00658 if ( ptSourceIstream )
00659 {
00660 if ( !ptSourceIstream->eof() )
00661 {
00662
00663
00664
00665
00666 if ( EOF == ptSourceIstream->rdbuf()->pubseekoff (0, std::ios_base::cur, std::ios_base::in) )
00667 {
00668 throw TNotRePositionableStreamException ("bad stream", __FILE__, __LINE__);
00669 }
00670 }
00671 }
00672
00673 }
00674
00675
00676 mpcl::text::TString mpcl::text::regex::TMatcher::
00677 instantiate (const char* pkcPATTERN_STRING) const
00678 {
00679
00680 TStringToStringMap::const_iterator ktIter = tTerminalMap.begin();
00681 TStringToStringMap::const_iterator ktEnd = tTerminalMap.end();
00682 TString yInstance (pkcPATTERN_STRING);
00683
00684 for (; ( ktIter != ktEnd ) ;++ktIter)
00685 {
00686 yInstance.replaceAll (ktIter->first, ktIter->second);
00687 }
00688 return yInstance;
00689
00690 }
00691
00692
00693 bool mpcl::text::regex::TMatcher::
00694 matchChars (char cSOURCE, const char* pkcPATTERN_STRING) const
00695 {
00696
00697 bool gSuccess;
00698
00699 if ( gCaseSensitive )
00700 {
00701 gSuccess = ( *pkcPATTERN_STRING == cSOURCE );
00702 }
00703 else
00704 {
00705 gSuccess = ( tolower (*pkcPATTERN_STRING) == tolower (cSOURCE) );
00706 }
00707 return gSuccess;
00708
00709 }