Back
1 //------------------------------------------------------------------------------
2 // Module: Regex.cpp //
3 // //
4 // Class which encapsulates a very simple regular expression parser //
5 // //
6 // Copyright (c) 2000-2004 by Lars Haendel //
7 // Home: http://www.newty.de //
8 // //
9 // This program is free software and can be used under the terms of the //
10 // GNU licence. See header-file for further information and disclaimer. //
11 // //
12 //------------------------------------------------------------------------------
13
14 #include <stdio> // sprintf()
15
16 #include "Regex.h"
17 #include "NameUtil.h" // IsAlpha() etc.
18 #include "ErrText.h" // TErrText
19
20
21 //----------------------------------------------------------------------------------------------------------------------
22 // The "one character" functions (Code "\0" to "\7")
23
// code "\0": matches an arbitrary character, i.e. anything except the terminating '\0'
// returns 1 if szString[pos] is matched, -1 otherwise (character parameter and func are unused)
int Func0(const char*const& szString, const int& pos, const char&, void*)
{
   return (szString[pos]=='\0') ? -1 : 1;
}
28
29 // code "/1", return '1' if actual character (szString[pos]) is a whitespace
30 int Func1(const char*const& szString, const int& pos, const char&, void*)
31 { if(IsSpace(szString[pos])) return 1;
32 else return -1; }
33
34 // code "/2", return '1' if actual character (szString[pos]) is a number
35 int Func2(const char*const& szString, const int& pos, const char&, void*)
36 { if(IsDigit(szString[pos])) return 1;
37 else return -1; }
38
39 // code "/3", return '1' if actual character (szString[pos]) is a number or whitespace
40 int Func3(const char*const& szString, const int& pos, const char&, void*)
41 { if(IsDigit(szString[pos]) || IsSpace(szString[pos])) return 1;
42 else return -1; }
43
44 // code "/4", return '1' if actual character (szString[pos]) is an alpha
45 int Func4(const char*const& szString, const int& pos, const char&, void*)
46 { if(IsAlpha(szString[pos])) return 1;
47 else return -1; }
48
49 // code "/5", return '1' if actual character (szString[pos]) is an alpha or whitespace
50 int Func5(const char*const& szString, const int& pos, const char&, void*)
51 { if(IsAlpha(szString[pos]) || IsSpace(szString[pos])) return 1;
52 else return -1; }
53
54 // code "/6", return '1' if actual character (szString[pos]) is an alpha or number
55 int Func6(const char*const& szString, const int& pos, const char&, void*)
56 { if(IsAlpha(szString[pos]) || IsDigit(szString[pos])) return 1;
57 else return -1; }
58
59 // code "/7", return '1' if actual character (szString[pos]) is an alpha or number or whitespace
60 int Func7(const char*const& szString, const int& pos, const char&, void*)
61 { if(IsAlpha(szString[pos])|| IsDigit(szString[pos]) || IsSpace(szString[pos])) return 1;
62 else return -1; }
63
// matches exactly the specified character cChar
// returns 1 if szString[pos] equals cChar, -1 otherwise (func is unused)
int SpecifiedChar(const char*const& szString, const int& pos, const char& cChar, void* func)
{
   return (szString[pos]!=cChar) ? -1 : 1;
}
68
69
70 //----------------------------------------------------------------------------------------------------------------------
71 // yes macros are evil :-))
72 #define ARBITRARY_FUNC_BODY(OneFunc)\
73 TFuncPtr ptr = NULL;\
74 if(func) /* if passed ... */\
75 ptr = (TFuncPtr) func; /* ... cast from void* to TFuncPtr */\
76 \
77 int pos = _pos;\
78 while(szString[pos]!='\0') /* while end of string is not reached ... */\
79 {\
80 if(ptr) /* if function pointer was passed ... */\
81 {\
82 if(ptr(szString, pos, cChar, func)>0) /* ... use it to check if next element is matched ... */\
83 break; /* ... and break if it does */\
84 }\
85 \
86 if(OneFunc(szString, pos, cChar, func)==1) /* increment counter if actual character matches */\
87 pos++;\
88 else\
89 break;\
90 }\
91 return pos-_pos; /* return # of matching characters */
92
93
94 //----------------------------------------------------------------------------------------------------------------------
95 // The "arbitrary" functions (Code "\8" to "\F")
96
97 // code "/8", return # of matching characters
98 int Func8(const char*const& szString, const int& _pos, const char& cChar, void* func)
99 { ARBITRARY_FUNC_BODY(Func0); }
100
101 // code "/9", return # of matching characters
102 int Func9(const char*const& szString, const int& _pos, const char& cChar, void* func)
103 { ARBITRARY_FUNC_BODY(Func1); }
104
105 // code "/A", return # of matching characters
106 int FuncA(const char*const& szString, const int& _pos, const char& cChar, void* func)
107 { ARBITRARY_FUNC_BODY(Func2); }
108
109 // code "/B", return # of matching characters
110 int FuncB(const char*const& szString, const int& _pos, const char& cChar, void* func)
111 { ARBITRARY_FUNC_BODY(Func3); }
112
113 // code "/C", return # of matching characters
114 int FuncC(const char*const& szString, const int& _pos, const char& cChar, void* func)
115 { ARBITRARY_FUNC_BODY(Func4); }
116
117 // code "/D", return # of matching characters
118 int FuncD(const char*const& szString, const int& _pos, const char& cChar, void* func)
119 { ARBITRARY_FUNC_BODY(Func5); }
120
121 // code "/E", return # of matching characters
122 int FuncE(const char*const& szString, const int& _pos, const char& cChar, void* func)
123 { ARBITRARY_FUNC_BODY(Func6); }
124
125 // code "/F", return # of matching characters
126 int FuncF(const char*const& szString, const int& _pos, const char& cChar, void* func)
127 { ARBITRARY_FUNC_BODY(Func7); }
128
129
130
131 //----------------------------------------------------------------------------------------------------------------------
132 // constructor
133 TRegex::TRegex()
134 {
135 nFuncs=0;
136 func = NULL;
137 para = NULL;
138 f_Arbitrary = NULL;
139 szRegex[0] = '\0';
140 }
141
142
143 //----------------------------------------------------------------------------------------------------------------------
144 // destructor
145 TRegex::~TRegex()
146 {
147 // clear memory
148 delete[] func;
149 delete[] para;
150 delete[] f_Arbitrary;
151 }
152
153
154 //----------------------------------------------------------------------------------------------------------------------
155 // set regex
156 void TRegex::SetRegex(const char*const& _szRegex)
157 {
158 // 1. count number of regex elements, an element is either a backslash followed by a number or an arbitrary character
159 strcpy(szRegex, _szRegex);
160 int pos=0; // ini
161 nFuncs=0;
162 while(szRegex[pos]!='\0') // while regex string is not at its end
163 {
164 if(szRegex[pos]=='\\') // increment by one if character is escape sequence
165 {
166 pos++;
167 if(szRegex[pos]=='\0') // and leave loop if end of string is reached
168 break;
169 }
170
171 pos++;
172 nFuncs++;
173 }
174
175
176 // 2. allocate memory for regex elements
177 func = new TFuncPtr[nFuncs+1];
178 para = new char[nFuncs+1];
179 f_Arbitrary = new bool[nFuncs+1];
180
181
182 // 3. parse regex string again to initialize regex element function pointers, terminating character and arbitrary flag
183 pos=0; // ini
184 bool f_PreviousIsArbitrary = false;
185 for(int j=0;j<nFuncs;j++) // for all expected regex elements
186 {
187 if(szRegex[pos]=='\\' && szRegex[pos+1]!='\\')
188
189 // a) current regex element is a code started with a backslash
190 {
191 pos++; // increment position in regex string
192 int no = szRegex[pos]-48; // convert code number (char) from '0'-'9' to 0-9 and 'A'-'F' to 10-15
193 if(no>=17)
194 no-=7;
195
196 switch(no) // initialize depending on converted code number
197 {
198 case 0 : func[j] = &Func0; f_Arbitrary[j] = false; break;
199 case 1 : func[j] = &Func1; f_Arbitrary[j] = false; break;
200 case 2 : func[j] = &Func2; f_Arbitrary[j] = false; break;
201 case 3 : func[j] = &Func3; f_Arbitrary[j] = false; break;
202 case 4 : func[j] = &Func4; f_Arbitrary[j] = false; break;
203 case 5 : func[j] = &Func5; f_Arbitrary[j] = false; break;
204 case 6 : func[j] = &Func6; f_Arbitrary[j] = false; break;
205 case 7 : func[j] = &Func7; f_Arbitrary[j] = false; break;
206 case 8 : func[j] = &Func8; f_Arbitrary[j] = true; break;
207 case 9 : func[j] = &Func9; f_Arbitrary[j] = true; break;
208 case 10 : func[j] = &FuncA; f_Arbitrary[j] = true; break;
209 case 11 : func[j] = &FuncB; f_Arbitrary[j] = true; break;
210 case 12 : func[j] = &FuncC; f_Arbitrary[j] = true; break;
211 case 13 : func[j] = &FuncD; f_Arbitrary[j] = true; break;
212 case 14 : func[j] = &FuncE; f_Arbitrary[j] = true; break;
213 case 15 : func[j] = &FuncF; f_Arbitrary[j] = true; break;
214 default :
215 // unknown code: throw exception
216 char szText[256];
217 sprintf(szText, "Unknown code '\\%c' in %d'th regex element!", szRegex[pos], j);
218 throw TErrText(szText);
219 }
220 para[j] = '\0'; // ini to '\0', may be re-initialized if element of type 'specified
221 // character comes next
222 f_PreviousIsArbitrary = f_Arbitrary[j]; // set flag if element is of type 'arbitrary'
223 }
224
225 // b) current element is a specified character
226 else
227 {
228 if(szRegex[pos]=='\\') // increment by one if it's the escape char
229 pos++;
230 func[j] = &SpecifiedChar; // assign element's function
231 para[j] = szRegex[pos]; // store character
232 f_Arbitrary[j] = false; //
233
234 if(f_PreviousIsArbitrary) // if previous element was of type 'arbitrary' ...
235 para[j-1] = para[j]; // re-initialize
236 f_PreviousIsArbitrary = false;
237 }
238 pos++; // increment position
239 }
240
241 // 4. initialize dummy elements at the end
242 func[nFuncs] = NULL; // set to NULL or '\0'
243 para[nFuncs] = '\0';
244 f_Arbitrary[nFuncs] = false;
245 }
246
247
248 //----------------------------------------------------------------------------------------------------------------------
249 // returns # of matching characters if passed string starts with regular expression that was passed in constructor,
250 // else returns 0
251 int TRegex::StartsWithRegex(const char*const& szString) const
252 {
253 // ini
254 int pos = 0; // string position
255 int restore_pos = -1; // restore information: string position and regex element
256 int restore_j = 0; // note: used to jump back to last successfully processed element of type 'arbitrary'
257
258
259 // try to match all alements of regex one after the other
260 for(int j=0;j<nFuncs;j++)
261 {
262 int nChar = (func[j])(szString, pos, para[j], (void*) func[j+1]); // get number of matching characters
263 if(nChar>=0) // if match found ...
264 {
265 pos += nChar; // increment position by # matching characters
266
267 if(f_Arbitrary[j]) // if an regex element of type 'arbitrary' has been processed ...
268 {
269 restore_pos = pos; // store actual string position
270 restore_j = j; // and regex element ID to be able to "jump" back if matching fails in future
271 }
272 }
273
274 // no match found (negative value was returned)
275 else
276 {
277 // if no restore info exists or string is at its end ...
278 if(restore_pos<0 || szString[pos]=='\0')
279 return 0; // no match found, return
280
281 // else: jump back to last element of type 'arbitrary'
282 else
283 {
284 // restore
285 pos = restore_pos; // restore string position
286 j = restore_j; // restore regex element
287 restore_pos = -1; // reset restore information
288
289 // temporarily terminate string directly after actual position
290 char*const _szString = const_cast <char*const> (szString); // cast const away
291 char cTmp = szString[pos+1]; // remember original value and ...
292 _szString[pos+1] = '\0'; // ... temporarily terminate string
293
294 // call function of last element of type 'arbitrary' again to check if actual character matches
295 // note: the actual character is the character the last regex element of type 'arbitrary' was
296 // terminated with.
297 if((func[j])(szString, pos, '\0', NULL)>0)
298 {
299 // match: exactly one character has been checked. Thus continue parsing with next character and actual
300 // regex element
301 j--; // decrement cause will be incremented again by loop
302 _szString[pos+1] = cTmp; // restore string
303 pos++; // increment string position
304 continue; // proceed in loop note: obsolete, just to clarify
305 }
306 else
307 {
308 // no match: retry has failed, abort
309 _szString[pos+1] = cTmp; // restore string
310 return 0; // no match found
311 }
312 }
313 }
314 }
315
316 return pos; // return number of matching characters
317 }
Top |