dietc.lexer source code

1 module dietc.lexer;
2 
3 import std.algorithm;
4 import std.ascii;
5 import std.conv;
6 import std.string;
7 import std.utf;
8 
/// Kinds of tokens handled by the lexer.
enum TokenType
{
	/// A single character that is neither whitespace nor part of an identifier.
	raw,
	/// Leading whitespace that is deeper than the innermost open indentation level.
	indent,
	/// Closes one open indentation level.
	detent,
	/// A run of characters accepted by `tagIdentifierValidator`.
	identifier,
	/// Not produced by the lexing code in this module.
	code,
	/// A line terminator: `\r`, `\n` or `\r\n`.
	newline,
	/// A run of non-newline whitespace that does not directly follow a newline token.
	whitespace,
	/// End of input.
	eof
}
20 
/// A single lexed token.
struct Token
{
	/// What kind of token this is.
	TokenType type;
	/// Source text covered by the token; `null` for tokens synthesized at end of input.
	string content;
	/// Byte offsets `[start, end]` into the lexed code, `end` being exclusive.
	size_t[2] range;
}
27 
/// Accumulates lexer/parser errors together with their formatted messages.
///
/// Behaves like an `Error[]` through `alias this`.
struct ErrorContext
{
	/// One recorded error.
	struct Error
	{
		/// `DietInput.file` of the input the error was reported for.
		string file;
		/// Byte offset into the input's code the error refers to.
		size_t at;
		/// Formatted message, including the `file(line:column): ` prefix.
		string message;
	}

	/// All errors recorded so far, in the order they were reported.
	Error[] errors;
	alias errors this;

	/// Formats `s` as `file(line:column): s`, with one-based line and column
	/// computed from the byte offset `at` inside `input.code`.
	string formatMessage(ref DietInput input, size_t at, string s)
	{
		auto pos = input.code.bytesToPosition(at);
		return input.file ~ "(" ~ to!string(pos[0] + 1) ~ ":" ~ to!string(pos[1] + 1) ~ "): " ~ s;
	}

	/// Records the error message `s` for byte offset `at` of `input`.
	void error(ref DietInput input, size_t at, string s)
	{
		errors ~= Error(input.file, at, formatMessage(input, at, s));
	}

	/// Records an "Expected ..." error for byte offset `at` of `input`.
	///
	/// If an "Expected" error already exists for the same file and offset, the
	/// new expectation is appended to it (separated by `", "`) instead of
	/// adding another entry.
	///
	/// Params:
	///   input = input the error belongs to
	///   at = byte offset inside `input.code`
	///   expectation = description of what was expected
	///   srcfile = reporting source file, only included in debug builds
	///   srcline = reporting source line, only included in debug builds
	void expect(ref DietInput input, size_t at, string expectation,
			string srcfile = __FILE__, size_t srcline = __LINE__)
	{
		// Stored messages carry the "file(line:column): " prefix (and in debug
		// builds an additional "(src=...) " prefix), so they never *start* with
		// "Expected ". Look for the fully formatted head instead.
		immutable head = formatMessage(input, at, "Expected ");
		foreach (ref existing; errors)
		{
			if (existing.file == input.file && existing.at == at && existing.message.canFind(head))
			{
				existing.message ~= ", " ~ expectation;
				return;
			}
		}
		string prefix;
		debug prefix = "(src=" ~ srcfile ~ ":" ~ srcline.to!string ~ ") ";
		errors ~= Error(input.file, at, prefix ~ head ~ expectation);
	}
}
67 
/// zero-based [line, column]
alias Position = size_t[2];

/// Converts a byte offset into `code` to a zero-based line/column position.
///
/// Lines are terminated by `\n`, `\r\n` or a lone `\r`. The column counts the
/// code points on the current line that precede the offset; line terminators
/// do not count as columns.
///
/// Params:
///   code = the source text
///   bytes = byte offset into `code`; offsets past the end are treated as the end
/// Returns: `[line, column]`, both zero-based.
Position bytesToPosition(string code, size_t bytes)
{
	size_t line, column;
	bool wasCR;
	size_t i;
	while (code.length)
	{
		if (i >= bytes)
			break;
		size_t len;
		immutable c = code.decodeFront(len);
		i += len;
		if (c == '\r')
		{
			// decide later whether this is a lone CR or the start of CRLF
			wasCR = true;
		}
		else if (c == '\n')
		{
			line++;
			column = 0;
			wasCR = false;
		}
		else
		{
			if (wasCR)
			{
				// the previous lone CR ended a line
				line++;
				column = 0;
			}
			wasCR = false;
			// an ordinary character occupies one column
			column++;
		}
	}
	if (wasCR)
	{
		line++;
		column = 0;
	}
	return [line, column];
}
110 
/// Which character a file uses for indentation.
enum IndentStyle
{
	/// No indentation character has been seen yet.
	unknown,
	/// Indentation uses non-tab whitespace.
	spaces,
	/// Indentation uses tab characters.
	tabs
}
117 
/// Lexer state for one diet source text.
///
/// Offers a range-like interface (`front`, `popFront`, `empty`, `save`) over
/// the tokens of `code`, plus helpers for matching and skipping tokens.
/// `front` must be called before `popFront`, since `front` is what lexes the
/// next tokens into the backlog.
struct DietInput
{
	/// Name of the input; used as prefix of error messages.
	string file;
	/// The complete source text being lexed.
	string code;
	/// Byte offset into `code` of the next character to lex.
	size_t index;
	/// Number of columns a tab advances to when computing indentation levels.
	int tabSize = 4;
	/// Stack of the currently open indentation levels, in columns.
	size_t[] indentation;
	/// Indentation character seen so far.
	IndentStyle indentationStyle;
	/// The token most recently returned by `front` out of the backlog.
	Token last;
	/// Lexed but not yet consumed tokens in reverse order (the next token is the last element).
	Token[] backlog;
	/// Whether the token most recently popped from the backlog was a newline.
	bool lastWasNewline = true;

	/// Creates an input by reading the whole text of `file`.
	static DietInput fromFile(R)(R file)
	{
		import std.file : readText;

		DietInput ret;
		ret.code = readText(file);
		ret.file = file.to!string;
		return ret;
	}

	/// Errors collected while lexing this input.
	ErrorContext errors;

	/// Computes the indentation level (in columns) of a whitespace run.
	///
	/// A tab advances to the next multiple of `tabSize`, every other character
	/// counts as one column. The first character seen fixes
	/// `indentationStyle`; `error` is set when a later character uses the
	/// other style.
	size_t determineIndentation(string whitespace, out bool error)
	{
		assert(whitespace.byDchar.all!isWhite);
		size_t indentation; // @suppress(dscanner.suspicious.label_var_same_name)
		foreach (c; whitespace.byDchar)
		{
			if (c == '\t')
			{
				if (indentationStyle == IndentStyle.unknown)
					indentationStyle = IndentStyle.tabs;
				else if (indentationStyle == IndentStyle.spaces)
					error = true;
				indentation = (indentation / tabSize + 1) * tabSize;
			}
			else
			{
				if (indentationStyle == IndentStyle.unknown)
					indentationStyle = IndentStyle.spaces;
				else if (indentationStyle == IndentStyle.tabs)
					error = true;
				indentation++;
			}
		}
		return indentation;
	}

	/// Returns the slice of `code` covered by the byte range `[range[0], range[1])`.
	///
	/// The end is clamped to the length of `code`. Returns `null` when the
	/// (clamped) end lies before the start, which includes a start past the
	/// end of `code`.
	string read(size_t[2] range)
	{
		// clamp first so that a start beyond the end is caught by the order check
		if (range[1] > code.length)
			range[1] = code.length;
		if (range[1] < range[0])
			return null;
		return code[range[0] .. range[1]];
	}

	/// Rewinds the lexer to the start of `code`, clearing indentation, backlog
	/// and the remembered last token. Collected errors are kept.
	void reset()
	{
		index = 0;
		indentation.length = 0;
		backlog.length = 0;
		last = Token.init;
		indentationStyle = IndentStyle.unknown;
		lastWasNewline = true;
	}

	/// Checks if the current token matches type & optional match.
	bool peek(TokenType type, string match = null)
	{
		auto t = front();
		if (t.type != type)
			return false;
		if (match !is null)
			return t.content == match;
		return !empty || type == TokenType.eof;
	}

	/// Does a peek and advances a token.
	///
	/// The token is consumed whether or not it matched.
	bool match(TokenType type, string match = null)
	{
		auto ret = peek(type, match);
		popFront();
		return ret;
	}

	/// Does a peek and advances a token and adds an error if it doesn't match.
	///
	/// `srcfile` and `srcline` are forwarded to `ErrorContext.expect`.
	bool expect(TokenType type, string match = null, string srcfile = __FILE__,
			size_t srcline = __LINE__)
	{
		size_t at = index;
		Token tok = front;
		auto ret = this.match(type, match);
		if (!ret)
			errors.expect(this, at, (match.length ? "'" ~ match ~ "'"
					: type.to!string) ~ ", but got " ~ tok.to!string, srcfile, srcline);
		return ret;
	}

	/// Consumes tokens as long as their type is one of `types`.
	/// Returns: the number of tokens consumed.
	size_t skipAll(TokenType[] types...)
	{
		size_t n;
		while (types.canFind(front.type))
		{
			popFront();
			n++;
		}
		return n;
	}

	/// Consumes tokens as long as their type is one of `types`.
	/// Returns: how many tokens of each type were consumed, parallel to `types`.
	size_t[] skipAllCount(TokenType[] types...)
	{
		size_t[] n = new size_t[](types.length);
		while (true)
		{
			auto i = types.countUntil(front.type);
			if (i < 0)
				break;
			popFront();
			n[i]++;
		}
		return n;
	}

	/// Returns a copy of the lexer state whose indentation stack, backlog and
	/// error list are duplicated.
	auto save()
	{
		auto copy = this;
		copy.indentation = indentation.dup;
		copy.backlog = backlog.dup;
		copy.errors = errors.dup;
		return copy;
	}

	/// Consumes the current token.
	///
	/// With a non-empty backlog the next token is dropped from it. With an
	/// empty backlog at end of input, `index` is advanced as a counter and an
	/// exception is thrown once it exceeds the code length by more than 100.
	void popFront()
	{
		if (backlog.length)
		{
			lastWasNewline = backlog[$ - 1].type == TokenType.newline;
			backlog.length--;
		}
		else if (index >= code.length)
		{
			index++;
			if (index > code.length + 100)
				throw new Exception("Attempted to read past EOF too often");
		}
	}

	/// True once the input is exhausted: nothing left to lex, no pending
	/// tokens and no open indentation levels.
	bool empty() @property
	{
		return index >= code.length && backlog.length == 0 && indentation.length == 0;
	}

	/// Returns the current token, lexing more of `code` if the backlog is empty.
	///
	/// At end of input with open indentation levels, a newline token followed
	/// by one detent token per open level is synthesized. After that an `eof`
	/// token is returned.
	auto front() @property
	{
		if (index >= code.length && backlog.length == 0 && indentation.length > 0)
		{
			backlog.length = indentation.length + 1;
			backlog[0] = Token(TokenType.newline, null, [index, index]);
			foreach (i; 0 .. indentation.length)
				backlog[i + 1] = Token(TokenType.detent, null, [index, index]);
			// the backlog is consumed from the back, so the newline has to end up last
			backlog.reverse();
			indentation.length = 0;
		}
		if (index >= code.length && backlog.length == 0)
			return Token(TokenType.eof, null, [index, index]);
		else
		{
			// parse() may return no tokens (skipped whitespace), so repeat until there is one
			while (!backlog.length)
			{
				backlog = parse();
				backlog.reverse();
			}
			return last = backlog[$ - 1];
		}
	}

	/// Lexes the next piece of `code` starting at `index` and advances `index`.
	/// Returns: the produced tokens in source order; may be empty when only
	/// insignificant whitespace was skipped.
	private Token[] parse()
	{
		const size_t start = index;
		if (index >= code.length)
		{
			Token[] ret;
			foreach_reverse (indent; indentation)
			{
				ret ~= Token(TokenType.detent, null, [start, index]);
				indentation.length--;
			}
			ret ~= Token(TokenType.eof, null, [start, index]);
			return ret;
		}
		size_t dummy = index;
		auto c = decode(code, dummy);
		const size_t cLength = dummy - index;

		// skip start of file whitespace
		if (index == 0 && c.isWhite)
		{
			// advance to the first non-whitespace character, stopping at the
			// end of the input so a whitespace-only file is not decoded past its end
			size_t next = index;
			while (next < code.length)
			{
				size_t probe = next;
				if (!decode(code, probe).isWhite)
					break;
				next = probe;
			}
			index = next;
			return [];
		}

		if (c == '\r')
		{
			index++;
			// treat \r\n as a single newline token
			if (index < code.length && code[index] == '\n')
				index++;
			return [Token(TokenType.newline, code[start .. index], [start, index])];
		}
		else if (c == '\n')
		{
			index++;
			return [Token(TokenType.newline, code[start .. index], [start, index])];
		}
		else if (c.isWhiteButNotNewline)
		{
			const isIndent = last.type == TokenType.newline;
			string data = code[start .. $];
			bool uselessWhitespace;
			while (data.length)
			{
				size_t len;
				c = data.decodeFront(len);
				// whitespace directly followed by a line terminator
				if (c == '\r' || c == '\n')
					uselessWhitespace = true;
				if (!c.isWhiteButNotNewline)
					break;
				index += len;
			}
			assert(start != index);
			// a line consisting only of whitespace does not affect indentation
			if (uselessWhitespace && lastWasNewline)
				return [];
			if (isIndent)
			{
				bool error;
				auto level = determineIndentation(code[start .. index], error);
				if (error)
					errors.error(this, start, "Mixing spaces and tabs indentation");
				Token[] ret;
				if (!indentation.length || indentation[$ - 1] < level)
				{
					ret ~= Token(TokenType.indent, code[start .. index], [start, index]);
					indentation ~= level;
				}
				else
				{
					// close every open level that is deeper than the new one
					foreach_reverse (indent; indentation)
					{
						if (indent <= level)
							break;
						ret ~= Token(TokenType.detent, code[start .. index], [start, index]);
						indentation.length--;
					}
				}
				return ret;
			}
			else
				return [Token(TokenType.whitespace, code[start .. index], [start, index])];
		}
		else if (c.tagIdentifierValidator)
		{
			string data = code[start .. $];
			while (data.length)
			{
				size_t len;
				c = data.decodeFront(len);
				if (!c.tagIdentifierValidator)
					break;
				index += len;
			}
			return [Token(TokenType.identifier, code[start .. index], [start, index])];
		}
		else
		{
			index += cLength;
			return [Token(TokenType.raw, code[start .. index], [start, index])];
		}
	}
}
414 
/// Advances the input and returns true in case the tokens start with this match.
/// Warning: on match this will start a next token if it contains part of the match, discarding it basically.
///
/// When the match fails, either through a mismatch or because the input ends
/// before the whole text was matched, `input` is restored to the state it had
/// on entry.
bool matchText(ref DietInput input, string match)
{
	auto past = input.save();
	size_t i = 0;
	while (i < match.length && !input.empty)
	{
		auto p = input.front;
		input.popFront();
		// compare as much of the token as the remaining match text covers
		immutable n = min(p.content.length, match.length - i);
		if (match[i .. i + n] != p.content[0 .. n])
		{
			input = past;
			return false;
		}
		i += n;
	}
	if (i < match.length)
	{
		// ran out of input mid-match: undo the consumed tokens like the mismatch case does
		input = past;
		return false;
	}
	return true;
}
445 
/// True for whitespace characters other than the line terminators `\n` and `\r`.
bool isWhiteButNotNewline(dchar c)
{
	if (c == '\n' || c == '\r')
		return false;
	return c.isWhite;
}
450 
/// Accepts a single character of a tag identifier: `-`, `:`, `_` or an ASCII letter/digit.
alias tagIdentifierValidator = (c) => c.isAlphaNum || c == '_' || c == ':' || c == '-';
/// [-:_0-9a-zA-Z]+
///
/// Returns: true if every character of `s` is accepted by `tagIdentifierValidator`.
bool validateTagIdentifier(string s)
{
	foreach (c; s.byDchar)
	{
		if (!tagIdentifierValidator(c))
			return false;
	}
	return true;
}
457 
/// [-_0-9a-zA-Z]+
///
/// Returns: true if every character of `s` is `-`, `_` or an ASCII letter/digit.
bool validateIdentifier(string s)
{
	foreach (c; s.byDchar)
	{
		immutable accepted = c.isAlphaNum || c == '_' || c == '-';
		if (!accepted)
			return false;
	}
	return true;
}
463 
/// [-_a-zA-Z] [-_0-9a-zA-Z]*
///
/// Returns: true if every character of `s` is `-`, `_` or an ASCII letter,
/// where digits are additionally allowed everywhere except at the first position.
bool validateIdentifierAlpha(string s)
{
	size_t position;
	foreach (c; s.byDchar)
	{
		immutable digitAllowed = position > 0;
		if (!(c == '-' || c == '_' || c.isAlpha || (digitAllowed && c.isDigit)))
			return false;
		position++;
	}
	return true;
}
474 
/// Prefixes every line of `s` that contains non-whitespace characters with
/// `indentation`; line terminators and whitespace-only lines are kept unchanged.
string indent(string s, string indentation = "\t")
{
	string result;
	foreach (line; s.lineSplitter!(KeepTerminator.yes))
	{
		if (line.strip.length)
			result ~= indentation;
		result ~= line;
	}
	return result;
}