dietc.lexer source code

1 module dietc.lexer;
2 
3 import std.algorithm;
4 import std.ascii;
5 import std.conv;
6 import std.string;
7 import std.utf;
8 
9 enum TokenType
10 {
11 	raw,
12 	indent,
13 	detent,
14 	identifier,
15 	code,
16 	newline,
17 	whitespace,
18 	eof
19 }
20 
21 struct Token
22 {
23 	TokenType type;
24 	string content;
25 	size_t[2] range;
26 }
27 
28 struct ErrorContext
29 {
30 	struct Error
31 	{
32 		string file;
33 		size_t at;
34 		string message;
35 	}
36 
37 	Error[] errors;
38 	alias errors this;
39 
40 	string formatMessage(ref DietInput input, size_t at, string s)
41 	{
42 		auto pos = input.code.bytesToPosition(at);
43 		return input.file ~ "(" ~ to!string(pos[0] + 1) ~ ":" ~ to!string(pos[1] + 1) ~ "): " ~ s;
44 	}
45 
46 	void error(ref DietInput input, size_t at, string s)
47 	{
48 		errors ~= Error(input.file, at, formatMessage(input, at, s));
49 	}
50 
51 	void expect(ref DietInput input, size_t at, string expectation,
52 			string srcfile = __FILE__, size_t srcline = __LINE__)
53 	{
54 		foreach (ref error; errors)
55 		{
56 			if (error.file == input.file && error.at == at && error.message.startsWith("Expected "))
57 			{
58 				error.message ~= ", " ~ expectation;
59 				return;
60 			}
61 		}
62 		string prefix;
63 		debug prefix = "(src=" ~ srcfile ~ ":" ~ srcline.to!string ~ ") ";
64 		errors ~= Error(input.file, at, prefix ~ formatMessage(input, at, "Expected " ~ expectation));
65 	}
66 }
67 
68 /// zero-based [line, column]
69 alias Position = size_t[2];
70 
71 Position bytesToPosition(string code, size_t bytes)
72 {
73 	size_t line, column;
74 	bool wasCR;
75 	size_t i;
76 	while (code.length)
77 	{
78 		if (i >= bytes)
79 			break;
80 		size_t len;
81 		immutable c = code.decodeFront(len);
82 		i += len;
83 		if (c == '\r')
84 		{
85 			wasCR = true;
86 		}
87 		else if (c == '\n')
88 		{
89 			line++;
90 			column = 0;
91 			wasCR = false;
92 		}
93 		else
94 		{
95 			if (wasCR)
96 			{
97 				line++;
98 				column = 0;
99 			}
100 			wasCR = false;
101 		}
102 	}
103 	if (wasCR)
104 	{
105 		line++;
106 		column = 0;
107 	}
108 	return [line, column];
109 }
110 
111 enum IndentStyle
112 {
113 	unknown,
114 	spaces,
115 	tabs
116 }
117 
118 struct DietInput
119 {
120 	string file;
121 	string code;
122 	size_t index;
123 	int tabSize = 4;
124 	size_t[] indentation;
125 	IndentStyle indentationStyle;
126 	Token last;
127 	Token[] backlog;
128 	bool lastWasNewline = true;
129 
130 	size_t indexEOL() @property const
131 	{
132 		string pre = read([0, index]);
133 		if (pre.endsWith("\r\n"))
134 			return index - 2;
135 		else if (pre.endsWith("\r", "\n"))
136 			return index - 1;
137 		else
138 			return index;
139 	}
140 
141 	static DietInput fromFile(R)(R file)
142 	{
143 		import std.file : readText;
144 
145 		DietInput ret;
146 		ret.code = readText(file);
147 		ret.file = file.to!string;
148 		return ret;
149 	}
150 
151 	ErrorContext errors;
152 
153 	size_t determineIndentation(string whitespace, out bool error)
154 	{
155 		assert(whitespace.byDchar.all!isWhite);
156 		size_t indentation; // @suppress(dscanner.suspicious.label_var_same_name)
157 		foreach (c; whitespace.byDchar)
158 		{
159 			if (c == '\t')
160 			{
161 				if (indentationStyle == IndentStyle.unknown)
162 					indentationStyle = IndentStyle.tabs;
163 				else if (indentationStyle == IndentStyle.spaces)
164 					error = true;
165 				indentation = (indentation / tabSize + 1) * tabSize;
166 			}
167 			else
168 			{
169 				if (indentationStyle == IndentStyle.unknown)
170 					indentationStyle = IndentStyle.spaces;
171 				else if (indentationStyle == IndentStyle.tabs)
172 					error = true;
173 				indentation++;
174 			}
175 		}
176 		return indentation;
177 	}
178 
179 	string read(size_t[2] range) const
180 	{
181 		if (range[1] < range[0])
182 			return null;
183 		if (range[0] < 0)
184 			range[0] = 0;
185 		if (range[1] > code.length)
186 			range[1] = code.length;
187 		return code[range[0] .. range[1]];
188 	}
189 
190 	void reset()
191 	{
192 		index = 0;
193 		indentation.length = 0;
194 		backlog.length = 0;
195 		last = Token.init;
196 		indentationStyle = IndentStyle.unknown;
197 		lastWasNewline = true;
198 	}
199 
200 	/// Checks if the current token matches type & optional match.
201 	bool peek(TokenType type, string match = null)
202 	{
203 		auto t = front();
204 		if (t.type != type)
205 			return false;
206 		if (match !is null)
207 			return t.content == match;
208 		return !empty || type == TokenType.eof;
209 	}
210 
211 	/// Does a peek and advances a token.
212 	bool match(TokenType type, string match = null)
213 	{
214 		auto ret = peek(type, match);
215 		popFront();
216 		return ret;
217 	}
218 
219 	/// Does a peek and advances a token and adds an error if it doesn't match.
220 	bool expect(TokenType type, string match = null, string srcfile = __FILE__,
221 			size_t srcline = __LINE__)
222 	{
223 		size_t at = index;
224 		Token tok = front;
225 		auto ret = this.match(type, match);
226 		if (!ret)
227 			errors.expect(this, at, (match.length
228 					? "'" ~ match ~ "'" : type.to!string) ~ ", but got " ~ tok.to!string, srcfile, srcline);
229 		return ret;
230 	}
231 
232 	size_t skipAll(TokenType[] types...)
233 	{
234 		size_t n;
235 		while (types.canFind(front.type))
236 		{
237 			popFront();
238 			n++;
239 		}
240 		return n;
241 	}
242 
243 	size_t[] skipAllCount(TokenType[] types...)
244 	{
245 		size_t[] n = new size_t[](types.length);
246 		while (true)
247 		{
248 			auto i = types.countUntil(front.type);
249 			if (i < 0)
250 				break;
251 			popFront();
252 			n[i]++;
253 		}
254 		return n;
255 	}
256 
257 	auto save()
258 	{
259 		auto copy = this;
260 		copy.indentation = indentation.dup;
261 		copy.backlog = backlog.dup;
262 		copy.errors = errors.dup;
263 		return copy;
264 	}
265 
266 	void popFront()
267 	{
268 		if (backlog.length)
269 		{
270 			lastWasNewline = backlog[$ - 1].type == TokenType.newline;
271 			backlog.length--;
272 		}
273 		else if (index >= code.length)
274 		{
275 			index++;
276 			if (index > code.length + 100)
277 				throw new Exception("Attempted to read past EOF too often");
278 		}
279 	}
280 
281 	bool empty() @property
282 	{
283 		return index >= code.length && backlog.length == 0 && indentation.length == 0;
284 	}
285 
286 	auto front() @property
287 	{
288 		if (index >= code.length && backlog.length == 0 && indentation.length > 0)
289 		{
290 			if (indentation.length)
291 			{
292 				backlog.length = indentation.length + 1;
293 				backlog[0] = Token(TokenType.newline, null, [index, index]);
294 				foreach (i, indent; indentation)
295 					backlog[i + 1] = Token(TokenType.detent, null, [index, index]);
296 				backlog.reverse();
297 			}
298 			else
299 			{
300 				backlog = null;
301 			}
302 			indentation.length = 0;
303 		}
304 		if (index >= code.length && backlog.length == 0)
305 			return Token(TokenType.eof, null, [index, index]);
306 		else
307 		{
308 			while (!backlog.length)
309 			{
310 				backlog = parse();
311 				backlog.reverse();
312 			}
313 			return last = backlog[$ - 1];
314 		}
315 	}
316 
317 	private Token[] parse()
318 	{
319 		const size_t start = index;
320 		if (index >= code.length)
321 		{
322 			Token[] ret;
323 			foreach_reverse (indent; indentation)
324 			{
325 				ret ~= Token(TokenType.detent, null, [start, index]);
326 				indentation.length--;
327 			}
328 			ret ~= Token(TokenType.eof, null, [start, index]);
329 			return ret;
330 		}
331 		size_t dummy = index;
332 		auto c = decode(code, dummy);
333 		const size_t cLength = dummy - index;
334 
335 		// skip start of file whitespace
336 		if (index == 0 && c.isWhite)
337 		{
338 			size_t prev;
339 			while (c.isWhite)
340 			{
341 				prev = index;
342 				c = decode(code, index);
343 			}
344 			index = prev;
345 			return [];
346 		}
347 
348 		if (c == '\r')
349 		{
350 			index++;
351 			if (index < code.length && code[index] == '\n')
352 				index++;
353 			return [Token(TokenType.newline, code[start .. index], [start, index])];
354 		}
355 		else if (c == '\n')
356 		{
357 			index++;
358 			return [Token(TokenType.newline, code[start .. index], [start, index])];
359 		}
360 		else if (c.isWhiteButNotNewline)
361 		{
362 			const isIndent = last.type == TokenType.newline;
363 			string data = code[start .. $];
364 			bool uselessWhitespace;
365 			while (data.length)
366 			{
367 				size_t len;
368 				c = data.decodeFront(len);
369 				if (c == '\r' || c == '\n')
370 					uselessWhitespace = true;
371 				if (!c.isWhiteButNotNewline)
372 					break;
373 				index += len;
374 			}
375 			assert(start != index);
376 			if (uselessWhitespace && lastWasNewline)
377 				return [];
378 			if (isIndent)
379 			{
380 				bool error;
381 				auto level = determineIndentation(code[start .. index], error);
382 				if (error)
383 					errors.error(this, start, "Mixing spaces and tabs indentation");
384 				Token[] ret;
385 				if (!indentation.length || indentation[$ - 1] < level)
386 				{
387 					ret ~= Token(TokenType.indent, code[start .. index], [start, index]);
388 					indentation ~= level;
389 				}
390 				else
391 				{
392 					foreach_reverse (indent; indentation)
393 					{
394 						if (indent <= level)
395 							break;
396 						ret ~= Token(TokenType.detent, code[start .. index], [start, index]);
397 						indentation.length--;
398 					}
399 				}
400 				return ret;
401 			}
402 			else
403 				return [
404 					Token(TokenType.whitespace, code[start .. index], [start, index])
405 				];
406 		}
407 		else if (c.tagIdentifierValidator)
408 		{
409 			string data = code[start .. $];
410 			while (data.length)
411 			{
412 				size_t len;
413 				c = data.decodeFront(len);
414 				if (!c.tagIdentifierValidator)
415 					break;
416 				index += len;
417 			}
418 			return [Token(TokenType.identifier, code[start .. index], [start, index])];
419 		}
420 		else
421 		{
422 			index += cLength;
423 			return [Token(TokenType.raw, code[start .. index], [start, index])];
424 		}
425 	}
426 }
427 
428 /// Advances the input and returns true in case the tokens start with this match.
429 /// Warning: on match this will start a next token if it contains part of the match, discarding it basically.
430 bool matchText(ref DietInput input, string match)
431 {
432 	auto past = input.save();
433 	size_t i = 0;
434 	while (i < match.length && !input.empty)
435 	{
436 		auto p = input.front;
437 		input.popFront();
438 		if (match.length - i >= p.content.length)
439 		{
440 			if (match[i .. i += p.content.length] != p.content)
441 			{
442 				input = past;
443 				return false;
444 			}
445 		}
446 		else
447 		{
448 			if (match[i .. $] != p.content[0 .. match.length - i])
449 			{
450 				input = past;
451 				return false;
452 			}
453 			i = match.length;
454 		}
455 	}
456 	return i >= match.length;
457 }
458 
459 bool isWhiteButNotNewline(dchar c)
460 {
461 	return c.isWhite && c != '\n' && c != '\r';
462 }
463 
464 alias tagIdentifierValidator = (c) => c == '-' || c == ':' || c == '_' || c.isAlphaNum;
465 /// [-:_0-9a-zA-Z]+
466 bool validateTagIdentifier(string s)
467 {
468 	return s.byDchar.all!tagIdentifierValidator;
469 }
470 
471 /// [-_0-9a-zA-Z]+
472 bool validateIdentifier(string s)
473 {
474 	return s.byDchar.all!(c => c == '-' || c == '_' || c.isAlphaNum);
475 }
476 
477 /// [-_a-zA-Z] [-_0-9a-zA-Z]*
478 bool validateIdentifierAlpha(string s)
479 {
480 	bool first = true;
481 	return s.byDchar.all!((c) {
482 		auto ret = c == '-' || c == '_' || c.isAlpha || (!first && c.isDigit);
483 		first = false;
484 		return ret;
485 	});
486 }
487 
488 string indent(string s, string indentation = "\t")
489 {
490 	return s.lineSplitter!(KeepTerminator.yes)
491 		.map!(a => a.strip.length ? indentation ~ a : a)
492 		.join("");
493 }
494 
495 unittest
496 {
497 	import std.array : array;
498 
499 	DietInput input;
500 	input.file = "stdin";
501 	input.code = q{doctype html
502 html
503 
504 };
505 
506 	auto tokens = input.array;
507 	assert(input.errors.length == 0);
508 
509 	with (TokenType)
510 		assert(tokens == [
511 				Token(identifier, "doctype", [0, 7]), Token(whitespace, " ", [7, 8]),
512 				Token(identifier, "html", [8, 12]), Token(newline, "\n", [12, 13]),
513 				Token(identifier, "html", [13, 17]), Token(newline, "\n", [17, 18]),
514 				Token(newline, "\n", [18, 19])
515 				]);
516 }