Back

1     //------------------------------------------------------------------------------
2     // Module: Regex.cpp                                                          //
3     //                                                                            //
4     //    Class which encapsulates a very simple regular expression parser        //
5     //                                                                            //
6     //    Copyright (c) 2000-2004 by Lars Haendel                                 //
7     //    Home: http://www.newty.de                                               //
8     //                                                                            //
9     //    This program is free software and can be used under the terms of the    //
10    //    GNU licence. See header-file for further information and disclaimer.    //
11    //                                                                            //
12    //------------------------------------------------------------------------------
13
14    #include <stdio>                       //          sprintf()
15
16    #include "Regex.h"
17    #include "NameUtil.h"                  //          IsAlpha() etc.
18    #include "ErrText.h"                   //          TErrText
19
20
21    //----------------------------------------------------------------------------------------------------------------------
22    // The "one character" functions (Code "\0" to "\7")
23
// Code "\0": matches one arbitrary character.
// Returns 1 if szString[pos] is not the string terminator, otherwise -1.
// The character and function-pointer arguments are not used.
int Func0(const char*const& szString, const int& pos, const char&, void*)
{
   const bool f_AtEnd = (szString[pos]=='\0');
   return f_AtEnd ? -1 : 1;
}
28
29    // code "/1", return '1' if actual character (szString[pos]) is a whitespace
30    int Func1(const char*const& szString, const int& pos, const char&, void*)
31    {  if(IsSpace(szString[pos])) return 1;
32       else return -1; }
33
34    // code "/2", return '1' if actual character (szString[pos]) is a number
35    int Func2(const char*const& szString, const int& pos, const char&, void*)
36    {  if(IsDigit(szString[pos])) return 1;
37       else return -1; }
38
39    // code "/3", return '1' if actual character (szString[pos]) is a number or whitespace
40    int Func3(const char*const& szString, const int& pos, const char&, void*)
41    {  if(IsDigit(szString[pos]) || IsSpace(szString[pos])) return 1;
42       else return -1; }
43
44    // code "/4", return '1' if actual character (szString[pos]) is an alpha
45    int Func4(const char*const& szString, const int& pos, const char&, void*)
46    {  if(IsAlpha(szString[pos])) return 1;
47       else return -1; }
48
49    // code "/5", return '1' if actual character (szString[pos]) is an alpha or whitespace
50    int Func5(const char*const& szString, const int& pos, const char&, void*)
51    {  if(IsAlpha(szString[pos]) || IsSpace(szString[pos])) return 1;
52       else return -1; }
53
54    // code "/6", return '1' if actual character (szString[pos]) is an alpha or number
55    int Func6(const char*const& szString, const int& pos, const char&, void*)
56    {  if(IsAlpha(szString[pos]) || IsDigit(szString[pos])) return 1;
57       else return -1; }
58
59    // code "/7", return '1' if actual character (szString[pos]) is an alpha or number or whitespace
60    int Func7(const char*const& szString, const int& pos, const char&, void*)
61    {  if(IsAlpha(szString[pos])|| IsDigit(szString[pos]) || IsSpace(szString[pos])) return 1;
62       else return -1; }
63
// Matches one explicitly specified character.
// Returns 1 if szString[pos] equals cChar, otherwise -1.
// The function-pointer argument is not used.
int SpecifiedChar(const char*const& szString, const int& pos, const char& cChar, void* func)
{
   const bool f_Equal = (szString[pos]==cChar);
   return f_Equal ? 1 : -1;
}
68
69
70    //----------------------------------------------------------------------------------------------------------------------
// yes macros are evil :-))
//
// Shared body of the "arbitrary" functions. It must be expanded inside a function that has the parameters
// (szString, _pos, cChar, func). Starting at szString[_pos] it counts consecutive characters accepted by the
// one-character function OneFunc and returns that count (possibly 0).
//
// If func is not NULL it is taken to be the function (TFuncPtr) of the following regex element. The run ends
// early as soon as that function reports a match (>0) at the current position.
//
// The following element's function is probed with a NULL terminator of its own. Forwarding func here would hand
// an "arbitrary" follower a pointer to itself and make it recurse endlessly (e.g. two consecutive arbitrary codes).
#define ARBITRARY_FUNC_BODY(OneFunc)\
   TFuncPtr ptr = NULL;\
   if(func)                                        /* if passed ... */\
      ptr = (TFuncPtr) func;                       /* ... cast from void* to TFuncPtr */\
\
   int pos = _pos;\
   while(szString[pos]!='\0')                      /* while end of string is not reached ... */\
   {\
      /* if the next element's function was passed and it matches here: stop, the run ends */\
      if(ptr && ptr(szString, pos, cChar, NULL)>0)\
         break;\
\
      if(OneFunc(szString, pos, cChar, func)==1)   /* increment counter if actual character matches */\
         pos++;\
      else\
         break;\
   }\
   return pos-_pos;                                /* return # of matching characters */
92
93
94    //----------------------------------------------------------------------------------------------------------------------
95    // The "arbitrary" functions (Code "\8" to "\F")
96
97    // code "/8", return # of matching characters
98    int Func8(const char*const& szString, const int& _pos, const char& cChar, void* func)
99    { ARBITRARY_FUNC_BODY(Func0); }
100
101   // code "/9", return # of matching characters
102   int Func9(const char*const& szString, const int& _pos, const char& cChar, void* func)
103   { ARBITRARY_FUNC_BODY(Func1); }
104
105   // code "/A", return # of matching characters
106   int FuncA(const char*const& szString, const int& _pos, const char& cChar, void* func)
107   { ARBITRARY_FUNC_BODY(Func2); }
108
109   // code "/B", return # of matching characters
110   int FuncB(const char*const& szString, const int& _pos, const char& cChar, void* func)
111   { ARBITRARY_FUNC_BODY(Func3); }
112
113   // code "/C", return # of matching characters
114   int FuncC(const char*const& szString, const int& _pos, const char& cChar, void* func)
115   { ARBITRARY_FUNC_BODY(Func4); }
116
117   // code "/D", return # of matching characters
118   int FuncD(const char*const& szString, const int& _pos, const char& cChar, void* func)
119   { ARBITRARY_FUNC_BODY(Func5); }
120
121   // code "/E", return # of matching characters
122   int FuncE(const char*const& szString, const int& _pos, const char& cChar, void* func)
123   { ARBITRARY_FUNC_BODY(Func6); }
124
125   // code "/F", return # of matching characters
126   int FuncF(const char*const& szString, const int& _pos, const char& cChar, void* func)
127   { ARBITRARY_FUNC_BODY(Func7); }
128
129
130
131   //----------------------------------------------------------------------------------------------------------------------
132   // constructor
133   TRegex::TRegex()
134   {
135      nFuncs=0;
136      func = NULL;
137      para = NULL;
138      f_Arbitrary = NULL;
139      szRegex[0] = '\0';
140   }
141
142
143   //----------------------------------------------------------------------------------------------------------------------
144   // destructor
145   TRegex::~TRegex()
146   {
147      // clear memory
148      delete[] func;
149      delete[] para;
150      delete[] f_Arbitrary;
151   }
152
153
154   //----------------------------------------------------------------------------------------------------------------------
155   // set regex
156   void TRegex::SetRegex(const char*const& _szRegex)
157   {
158      // 1. count number of regex elements, an element is either a backslash followed by a number or an arbitrary character
159      strcpy(szRegex, _szRegex);
160      int pos=0;                          // ini
161      nFuncs=0;
162      while(szRegex[pos]!='\0')           // while regex string is not at its end
163      {
164         if(szRegex[pos]=='\\')           // increment by one if character is escape sequence
165         {
166            pos++;
167            if(szRegex[pos]=='\0')        // and leave loop if end of string is reached
168               break;
169         }
170
171         pos++;
172         nFuncs++;
173      }
174
175
176      // 2. allocate memory for regex elements
177      func = new TFuncPtr[nFuncs+1];
178      para = new char[nFuncs+1];
179      f_Arbitrary = new bool[nFuncs+1];
180
181
182      // 3. parse regex string again to initialize regex element function pointers, terminating character and arbitrary flag
183      pos=0;                                 // ini
184      bool f_PreviousIsArbitrary = false;
185      for(int j=0;j<nFuncs;j++)              // for all expected regex elements
186      {
187         if(szRegex[pos]=='\\' && szRegex[pos+1]!='\\')
188
189         // a) current regex element is a code started with a backslash
190         {
191            pos++;                           // increment position in regex string
192            int no = szRegex[pos]-48;        // convert code number (char) from '0'-'9' to 0-9 and 'A'-'F' to 10-15
193            if(no>=17)
194               no-=7;
195
196            switch(no)                       // initialize depending on converted code number
197            {
198               case 0  : func[j] = &Func0; f_Arbitrary[j] = false; break;
199               case 1  : func[j] = &Func1; f_Arbitrary[j] = false; break;
200               case 2  : func[j] = &Func2; f_Arbitrary[j] = false; break;
201               case 3  : func[j] = &Func3; f_Arbitrary[j] = false; break;
202               case 4  : func[j] = &Func4; f_Arbitrary[j] = false; break;
203               case 5  : func[j] = &Func5; f_Arbitrary[j] = false; break;
204               case 6  : func[j] = &Func6; f_Arbitrary[j] = false; break;
205               case 7  : func[j] = &Func7; f_Arbitrary[j] = false; break;
206               case 8  : func[j] = &Func8; f_Arbitrary[j] = true; break;
207               case 9  : func[j] = &Func9; f_Arbitrary[j] = true; break;
208               case 10 : func[j] = &FuncA; f_Arbitrary[j] = true; break;
209               case 11 : func[j] = &FuncB; f_Arbitrary[j] = true; break;
210               case 12 : func[j] = &FuncC; f_Arbitrary[j] = true; break;
211               case 13 : func[j] = &FuncD; f_Arbitrary[j] = true; break;
212               case 14 : func[j] = &FuncE; f_Arbitrary[j] = true; break;
213               case 15 : func[j] = &FuncF; f_Arbitrary[j] = true; break;
214               default :
215                         // unknown code: throw exception
216                         char szText[256];
217                         sprintf(szText, "Unknown code '\\%c' in %d'th regex element!", szRegex[pos], j);
218                         throw TErrText(szText);
219            }
220            para[j] = '\0';                           // ini to '\0', may be re-initialized if element of type 'specified
221                                                      // character comes next
222            f_PreviousIsArbitrary = f_Arbitrary[j];   // set flag if element is of type 'arbitrary'
223         }
224
225         // b) current element is a specified character
226         else
227         {
228            if(szRegex[pos]=='\\')           // increment by one if it's the escape char
229               pos++;
230            func[j] = &SpecifiedChar;        // assign element's function
231            para[j] = szRegex[pos];          // store character
232            f_Arbitrary[j] = false;          //
233
234            if(f_PreviousIsArbitrary)        // if previous element was of type 'arbitrary' ...
235               para[j-1] = para[j];          // re-initialize
236            f_PreviousIsArbitrary = false;
237         }
238         pos++;   // increment position
239      }
240
241      // 4. initialize dummy elements at the end
242      func[nFuncs] = NULL;                   // set to NULL or '\0'
243      para[nFuncs] = '\0';
244      f_Arbitrary[nFuncs] = false;
245   }
246
247
248   //----------------------------------------------------------------------------------------------------------------------
249   // returns # of matching characters if passed string starts with regular expression that was passed in constructor,
250   // else returns 0
251   int TRegex::StartsWithRegex(const char*const& szString) const
252   {
253      // ini
254      int pos = 0;               // string position
255      int restore_pos = -1;      // restore information: string position and regex element
256      int restore_j = 0;         // note: used to jump back to last successfully processed element of type 'arbitrary'
257
258
259      // try to match all alements of regex one after the other
260      for(int j=0;j<nFuncs;j++)
261      {
262         int nChar = (func[j])(szString, pos, para[j], (void*) func[j+1]);    // get number of matching characters
263         if(nChar>=0)                                                         // if match found ...
264         {
265            pos += nChar;              // increment position by # matching characters
266
267            if(f_Arbitrary[j])         // if an regex element of type 'arbitrary' has been processed ...
268            {
269               restore_pos = pos;      // store actual string position
270               restore_j = j;          // and regex element ID to be able to "jump" back if matching fails in future
271            }
272         }
273
274         // no match found (negative value was returned)
275         else
276         {
277            // if no restore info exists or string is at its end ...
278            if(restore_pos<0 || szString[pos]=='\0')
279               return 0;                                 // no match found, return
280
281            // else: jump back to last element of type 'arbitrary'
282            else
283            {
284               // restore
285               pos = restore_pos;                                             // restore string position
286               j = restore_j;                                                 // restore regex element
287               restore_pos = -1;                                              // reset restore information
288
289               // temporarily terminate string directly after actual position
290               char*const _szString = const_cast <char*const> (szString);     // cast const away
291               char cTmp = szString[pos+1];                                   // remember original value and ...
292               _szString[pos+1] = '\0';                                       // ... temporarily terminate string
293
294               // call function of last element of type 'arbitrary' again to check if actual character matches
295               // note: the actual character is the character the last regex element of type 'arbitrary' was
296               // terminated with.
297               if((func[j])(szString, pos, '\0', NULL)>0)
298               {
299                  // match: exactly one character has been checked. Thus continue parsing with next character and actual
300                  //        regex element
301                  j--;                       // decrement cause will be incremented again by loop
302                  _szString[pos+1] = cTmp;   // restore string
303                  pos++;                     // increment string position
304                  continue;                  // proceed in loop   note: obsolete, just to clarify
305               }
306               else
307               {
308                  // no match: retry has failed, abort
309                  _szString[pos+1] = cTmp;   // restore string
310                  return 0;                  // no match found
311               }
312            }
313         }
314      }
315
316      return pos;    // return number of matching characters
317   }

Top