cgv
Loading...
Searching...
No Matches
tokenizer.cxx
1#include "tokenizer.h"
2#include "scan.h"
3
4namespace cgv {
5 namespace utils {
6
7void tokenizer::init()
8{
9 separators = "";
10 merge_separators = false;
11 begin_skip = "";
12 end_skip = "";
13 whitespaces = " \t\n";
14}
15
17{
18 init();
19}
20
22{
23 init();
24}
25
27{
28 begin = p;
29 end = p + std::string(p).size();
30 init();
31}
32
33tokenizer::tokenizer(const std::string& s) : token(s)
34{
35 init();
36}
37
38tokenizer& tokenizer::set_ws(const std::string& ws)
39{
40 whitespaces = ws;
41 return *this;
42}
43
44tokenizer& tokenizer::set_skip(const std::string& open, const std::string& close)
45{
46 begin_skip = open;
47 end_skip = close;
48 return *this;
49}
50
51tokenizer& tokenizer::set_skip(const std::string& open, const std::string& close, const std::string& escape)
52{
53 begin_skip = open;
54 end_skip = close;
55 escape_skip = escape;
56 return *this;
57}
58
59tokenizer& tokenizer::set_sep(const std::string& sep, bool merge)
60{
61 separators = sep;
62 merge_separators = merge;
63 return *this;
64}
65
66tokenizer& tokenizer::set_sep(const std::string& sep)
67{
68 separators = sep;
69 return *this;
70}
71
73{
74 merge_separators = merge;
75 return *this;
76}
77
78bool tokenizer::handle_skip(token& result)
79{
80 if (result.end >= end)
81 return false;
82 std::size_t i = begin_skip.find_first_of(*result.end);
83 if (i == std::string::npos)
84 return false;
85 ++result.end;
86 if (escape_skip.size() > i) {
87 bool last_was_escape = false;
88 while (result.end < end) {
89 if (last_was_escape)
90 last_was_escape = false;
91 else if (*result.end == escape_skip[i])
92 last_was_escape = true;
93 else if (*result.end == end_skip[i])
94 break;
95 ++result.end;
96 }
97 }
98 else {
99 while (result.end < end) {
100 if (*result.end == end_skip[i])
101 break;
102 ++result.end;
103 }
104 }
105 if (is_element(end_skip[i], whitespaces))
106 --result.end;
107 return true;
108}
109
110bool tokenizer::reverse_handle_skip(token& result)
111{
112 if (result.begin <= begin)
113 return false;
114 std::size_t i = end_skip.find_first_of(*(result.begin-1));
115 if (i == std::string::npos)
116 return false;
117 --result.begin;
118 if (escape_skip.size() > i) {
119 while (result.begin > begin) {
120 if (*(result.begin-1) == begin_skip[i]) {
121 unsigned nr_escape = 0;
122 while (result.begin-nr_escape-1 > begin && *(result.begin-nr_escape-2) == escape_skip[i])
123 ++nr_escape;
124 if ((nr_escape & 1) == 1)
125 break;
126 }
127 --result.begin;
128 }
129 }
130 else {
131 while (result.begin > begin) {
132 if (*(result.begin-1) == begin_skip[i])
133 break;
134 --result.begin;
135 }
136 }
137 if (is_element(begin_skip[i], whitespaces))
138 ++result.end;
139 return true;
140}
141
142bool tokenizer::handle_separators(token& result, bool check_skip)
143{
144 // handle separator tokens
145 bool did_skip = false;
146 if (check_skip)
147 did_skip = handle_skip(result);
148 if (result.end < end && is_element(*result.end, separators)) {
149 if (merge_separators && !did_skip) {
150 skip(separators);
151 result.end = begin;
152 }
153 else
154 begin = result.end = result.end+1;
155 return true;
156 }
157 return false;
158}
159
160bool tokenizer::reverse_handle_separators(token& result, bool check_skip)
161{
162 // handle separator tokens
163 bool did_skip = false;
164 if (check_skip)
165 did_skip = reverse_handle_skip(result);
166
167 if (result.begin > begin && is_element(*(result.begin-1), separators)) {
168 if (merge_separators && !did_skip) {
169 reverse_skip(separators);
170 end = result.begin;
171 }
172 else
173 end = result.begin = result.begin+1;
174 return true;
175 }
176 return false;
177}
178
180{
181 // handle whitespaces
183 token result(begin, begin);
184
185 if (handle_separators(result)) {
186 begin = result.end;
187 return result;
188 }
189 if (result.end == end) {
190 begin = result.end;
191 return result;
192 }
193 // merge non separator characters
194 while (++result.end < end) {
195 // handle skip characters
196 const char* tmp_end = result.end;
197 if (handle_skip(result) && result.end == end)
198 break;
199 if (result.end < end &&
200 ( is_element(*result.end, separators) ||
201 is_element(*result.end, whitespaces) ) ) {
202 begin = result.end = tmp_end;
203 return result;
204 }
205 }
206 begin = result.end;
207 return result;
208}
209
211{
212 // handle whitespaces
214 token result(end, end);
215
216 if (reverse_handle_separators(result))
217 return result;
218 if (result.begin == begin)
219 return result;
220 // merge non separator characters
221 while (--result.begin > begin) {
222 // handle skip characters
223 const char* tmp_begin = result.begin;
224 reverse_handle_skip(result);
225 if (result.begin > begin &&
226 ( is_element(*(result.begin-1), separators) ||
227 is_element(*(result.begin-1), whitespaces) ) ) {
228 end = result.begin = tmp_begin;
229 return result;
230 }
231 }
232 end = result.begin;
233 return result;
234}
235
236bool tokenizer::balanced_bite(token& result, const std::string& open_parenthesis, const std::string& close_parenthesis, bool wait_for_sep)
237{
238 // count the nesting level of all parentheses
239 std::vector<int> nesting;
240 nesting.resize(open_parenthesis.size());
241 std::fill(nesting.begin(),nesting.end(),0);
242 int nr_nested = 0;
243 // handle whitespaces
245 result = token(begin, begin);
246
247 if (handle_separators(result, false))
248 return true;
249 // remember whether we are inside a token
250 bool inside_token = true;
251 // merge non separator characters
252 while (result.end < end) {
253 // handle parenthesis
254 std::size_t i_close = close_parenthesis.find_first_of(*result.end);
255 std::size_t i_open = open_parenthesis.find_first_of(*result.end);
256 if (i_close != std::string::npos || i_open != std::string::npos) {
257 if (!wait_for_sep)
258 inside_token = false;
259 // first handle case when open and close parentheses are identical
260 if (i_open == i_close) {
261 if (nesting[i_open] == 0) {
262 ++nesting[i_open];
263 ++nr_nested;
264 }
265 else {
266 if (--nesting[i_open] == 0)
267 --nr_nested;
268 }
269 }
270 else if (i_close != std::string::npos) {
271 if (nesting[i_close] == 0)
272 return false;
273 if (--nesting[i_close] == 0)
274 --nr_nested;
275 }
276 else {
277 if (++nesting[i_open] == 1)
278 ++nr_nested;
279 }
280 }
281 else {
282 // handle skip characters
283 const char* tmp_end = result.end;
284 handle_skip(result);
285 if (is_element(*result.end, separators) ||
286 (is_element(*result.end, whitespaces) && nr_nested == 0) ) {
287 begin = result.end = tmp_end;
288 return nr_nested == 0;
289 }
290 }
291 ++result.end;
292 if (!inside_token && nr_nested == 0)
293 break;
294 }
295 if (nr_nested == 0) {
296 begin = result.end;
297 return true;
298 }
299 return false;
300}
301
303{
304 skip(whitespaces);
305}
306
308{
309 reverse_skip(whitespaces);
310}
311
313void tokenizer::bite_all(std::vector<token>& result)
314{
315 while(!skip_ws_check_empty())
316 result.push_back(bite());
317}
318
319 }
320}
the tokenizer allows splitting text into tokens in a convenient way.
Definition tokenizer.h:68
void skip_whitespaces()
skip whitespaces at the front
void reverse_skip_whitespaces()
skip whitespaces at the back
bool balanced_bite(token &result, const std::string &open_parenthesis, const std::string &close_parenthesis, bool wait_for_sep=false)
bite one token until all potentially nested opened parentheses have been closed again
tokenizer & set_sep(const std::string &sep, bool merge)
set the list of separators and specify whether succeeding separators are merged into single tokens
Definition tokenizer.cxx:59
tokenizer & set_sep_merge(bool merge)
specify whether succeeding separators are merged into single tokens
Definition tokenizer.cxx:72
tokenizer & set_ws(const std::string &ws)
set the list of white spaces, that separate tokens and are skipped
Definition tokenizer.cxx:38
bool skip_ws_check_empty()
skip whitespaces at the front and return whether the complete text has been processed
Definition tokenizer.h:111
token bite()
bite away a single token from the front
token reverse_bite()
bite away a single token from the back
tokenizer()
construct empty tokenizer
Definition tokenizer.cxx:16
tokenizer & set_skip(const std::string &open, const std::string &close)
set several character pairs that enclose tokens that are not split
Definition tokenizer.cxx:44
bool is_element(char c, const std::string &s)
check if char c arises in string s
Definition scan.cxx:291
the cgv namespace
Definition print.h:11
Helper functions to process strings.
representation of a token in a text by two pointers begin and end, that point to the first character ...
Definition token.h:18
void skip(const std::string &skip_chars)
set begin by skipping all instances of the given character set
Definition token.cxx:33
const char * begin
pointers that define the range of characters
Definition token.h:20
token()
construct with both pointers set to 0
Definition token.cxx:8
void reverse_skip(const std::string &skip_chars)
set end by skipping all instances of the given character set
Definition token.cxx:38