module dietc.lexer;

import std.algorithm;
import std.ascii;
import std.conv;
import std.string;
import std.utf;

/// Kinds of tokens produced by the lexer.
enum TokenType
{
    raw,
    indent,
    detent,
    identifier,
    code,
    newline,
    whitespace,
    eof
}

/// A single lexed token.
struct Token
{
    /// Kind of the token.
    TokenType type;
    /// Source text covered by the token (null for tokens synthesized at end of input).
    string content;
    /// Half-open byte range `[start, end)` inside the input code.
    size_t[2] range;
}

/// Collects errors raised while lexing/parsing. Behaves like an `Error[]` through `alias this`.
struct ErrorContext
{
    /// One recorded error.
    struct Error
    {
        /// File name of the input the error belongs to.
        string file;
        /// Byte offset inside the input the error refers to.
        size_t at;
        /// Fully formatted message, including the `file(line:column): ` prefix.
        string message;
    }

    Error[] errors;
    alias errors this;

    /// Formats `s` as `file(line:column): s`, with one-based line and column computed from byte offset `at`.
    string formatMessage(ref DietInput input, size_t at, string s)
    {
        auto pos = input.code.bytesToPosition(at);
        return input.file ~ "(" ~ to!string(pos[0] + 1) ~ ":" ~ to!string(pos[1] + 1) ~ "): " ~ s;
    }

    /// Records an error with message `s` at byte offset `at` of `input`.
    void error(ref DietInput input, size_t at, string s)
    {
        errors ~= Error(input.file, at, formatMessage(input, at, s));
    }

    /// Records an "Expected ..." error at byte offset `at` of `input`.
    ///
    /// If an error for the same file and offset whose message starts with "Expected " already
    /// exists, the expectation is appended to it instead of adding a new entry.
    /// In debug builds the message is prefixed with the calling source location.
    void expect(ref DietInput input, size_t at, string expectation,
            string srcfile = __FILE__, size_t srcline = __LINE__)
    {
        // NOTE(review): messages stored by this struct always begin with the "file(line:col): "
        // prefix produced by formatMessage (and "(src=...) " in debug builds), so this
        // startsWith check does not match messages created below. Kept unchanged to not alter
        // the emitted messages - confirm the intended merge behavior.
        foreach (ref error; errors)
        {
            if (error.file == input.file && error.at == at && error.message.startsWith("Expected "))
            {
                error.message ~= ", " ~ expectation;
                return;
            }
        }
        string prefix;
        debug prefix = "(src=" ~ srcfile ~ ":" ~ srcline.to!string ~ ") ";
        errors ~= Error(input.file, at, prefix ~ formatMessage(input, at, "Expected " ~ expectation));
    }
}

/// zero-based [line, column]
alias Position = size_t[2];

/// Converts a byte offset into `code` to a zero-based `[line, column]` position.
///
/// `\n`, `\r\n` and a lone `\r` each count as one line break. The column is the number of
/// UTF-8 code units (bytes) between the start of the line and the offset. Offsets past the end
/// of `code` resolve to the position at the end of the text.
Position bytesToPosition(string code, size_t bytes)
{
    size_t line, column;
    bool wasCR;
    size_t consumed;
    while (code.length && consumed < bytes)
    {
        size_t len;
        immutable c = code.decodeFront(len);
        consumed += len;
        if (c == '\r')
        {
            // the line break is only committed once we know whether a '\n' follows
            wasCR = true;
        }
        else if (c == '\n')
        {
            line++;
            column = 0;
            wasCR = false;
        }
        else
        {
            if (wasCR)
            {
                // lone '\r' before this character: it was a line break on its own
                line++;
                column = 0;
            }
            wasCR = false;
            // the character itself advances the column
            column += len;
        }
    }
    if (wasCR)
    {
        line++;
        column = 0;
    }
    return [line, column];
}

/// Which kind of character the input uses for indentation.
enum IndentStyle
{
    unknown,
    spaces,
    tabs
}

/// Lexer state over a piece of diet template code. Usable as a range of `Token`s.
struct DietInput
{
    /// Name of the input, used in error messages.
    string file;
    /// Full text that is being lexed.
    string code;
    /// Current byte offset inside `code`.
    size_t index;
    /// Width a tab character advances the indentation to (next multiple of this value).
    int tabSize = 4;
    /// Stack of currently open indentation levels.
    size_t[] indentation;
    /// Indentation style detected so far.
    IndentStyle indentationStyle;
    /// Token most recently returned by `front`.
    Token last;
    /// Lexed but not yet consumed tokens, stored in reverse order (next token is the last element).
    Token[] backlog;
    /// True if the token most recently removed from the backlog was a newline (initially true).
    bool lastWasNewline = true;

    /// Returns `index`, moved back in front of a line break that ends directly at `index`.
    size_t indexEOL() @property const
    {
        string pre = read([0, index]);
        if (pre.endsWith("\r\n"))
            return index - 2;
        else if (pre.endsWith("\r", "\n"))
            return index - 1;
        else
            return index;
    }

    /// Creates an input by reading the text of `file`; the file name is stored as `file.to!string`.
    static DietInput fromFile(R)(R file)
    {
        import std.file : readText;

        DietInput ret;
        ret.code = readText(file);
        ret.file = file.to!string;
        return ret;
    }

    /// Errors collected while lexing.
    ErrorContext errors;

    /// Computes the indentation width of a whitespace-only string.
    ///
    /// Tabs advance to the next multiple of `tabSize`, every other character counts as one.
    /// The first character seen decides `indentationStyle`; `error` is set if a later character
    /// uses the other style.
    size_t determineIndentation(string whitespace, out bool error)
    {
        assert(whitespace.byDchar.all!isWhite);
        size_t indentation; // @suppress(dscanner.suspicious.label_var_same_name)
        foreach (c; whitespace.byDchar)
        {
            const isTab = c == '\t';
            const style = isTab ? IndentStyle.tabs : IndentStyle.spaces;
            if (indentationStyle == IndentStyle.unknown)
                indentationStyle = style;
            else if (indentationStyle != style)
                error = true;

            if (isTab)
                indentation = (indentation / tabSize + 1) * tabSize;
            else
                indentation++;
        }
        return indentation;
    }

    /// Returns the slice of `code` for the byte range `[range[0], range[1])`.
    ///
    /// Returns null if the range is inverted. Both bounds are clamped to the length of `code`,
    /// so a range that lies completely past the end yields an empty string.
    string read(size_t[2] range) const
    {
        if (range[1] < range[0])
            return null;
        const end = min(range[1], code.length);
        const begin = min(range[0], end);
        return code[begin .. end];
    }

    /// Rewinds the lexer to the start of the input and clears indentation, backlog and last token.
    /// Collected errors are kept.
    void reset()
    {
        index = 0;
        indentation.length = 0;
        backlog.length = 0;
        last = Token.init;
        indentationStyle = IndentStyle.unknown;
        lastWasNewline = true;
    }

    /// Checks if the current token matches type & optional match.
    bool peek(TokenType type, string match = null)
    {
        auto t = front();
        if (t.type != type)
            return false;
        if (match !is null)
            return t.content == match;
        return !empty || type == TokenType.eof;
    }

    /// Does a peek and advances a token.
    bool match(TokenType type, string match = null)
    {
        auto ret = peek(type, match);
        popFront();
        return ret;
    }

    /// Does a peek and advances a token and adds an error if it doesn't match.
    bool expect(TokenType type, string match = null, string srcfile = __FILE__,
            size_t srcline = __LINE__)
    {
        size_t at = index;
        Token tok = front;
        auto ret = this.match(type, match);
        if (!ret)
            errors.expect(this, at, (match.length
                    ? "'" ~ match ~ "'" : type.to!string) ~ ", but got " ~ tok.to!string, srcfile, srcline);
        return ret;
    }

    /// Skips tokens as long as their type is one of `types`. Returns the number of skipped tokens.
    size_t skipAll(TokenType[] types...)
    {
        size_t n;
        while (types.canFind(front.type))
        {
            popFront();
            n++;
        }
        return n;
    }

    /// Skips tokens as long as their type is one of `types`.
    /// Returns how many tokens of each type were skipped, in the order of `types`.
    size_t[] skipAllCount(TokenType[] types...)
    {
        size_t[] n = new size_t[](types.length);
        while (true)
        {
            auto i = types.countUntil(front.type);
            if (i < 0)
                break;
            popFront();
            n[i]++;
        }
        return n;
    }

    /// Returns a copy of the lexer state whose indentation stack, backlog and error list are
    /// duplicated, so that advancing one copy does not affect the other.
    auto save()
    {
        auto copy = this;
        copy.indentation = indentation.dup;
        copy.backlog = backlog.dup;
        copy.errors.errors = errors.errors.dup;
        return copy;
    }

    /// Advances to the next token.
    /// Throws: Exception if called more than 100 times after the end of the input was reached.
    void popFront()
    {
        // Tokens are lexed lazily by front. Lex now if nothing is pending, so that a popFront
        // without a preceding front still consumes a token instead of doing nothing.
        if (!backlog.length && index < code.length)
            front();

        if (backlog.length)
        {
            lastWasNewline = backlog[$ - 1].type == TokenType.newline;
            backlog.length--;
        }
        else if (index >= code.length)
        {
            index++;
            if (index > code.length + 100)
                throw new Exception("Attempted to read past EOF too often");
        }
    }

    /// True once all input is consumed and no tokens or open indentation levels are pending.
    bool empty() @property
    {
        return index >= code.length && backlog.length == 0 && indentation.length == 0;
    }

    /// Returns the current token, lexing more input if needed. After the end of the input a
    /// newline followed by one detent per open indentation level is emitted, then eof tokens.
    auto front() @property
    {
        if (index >= code.length && backlog.length == 0 && indentation.length > 0)
        {
            // close all open indentation levels: newline first, then the detents
            backlog.length = indentation.length + 1;
            backlog[0] = Token(TokenType.newline, null, [index, index]);
            foreach (ref slot; backlog[1 .. $])
                slot = Token(TokenType.detent, null, [index, index]);
            backlog.reverse();
            indentation.length = 0;
        }
        if (index >= code.length && backlog.length == 0)
            return Token(TokenType.eof, null, [index, index]);
        else
        {
            // parse may return no tokens (skipped whitespace), so repeat until there is one
            while (!backlog.length)
            {
                backlog = parse();
                backlog.reverse();
            }
            return last = backlog[$ - 1];
        }
    }

    /// Lexes the next piece of input at `index` and returns the resulting tokens in source
    /// order. May return an empty array when only ignorable whitespace was consumed.
    private Token[] parse()
    {
        const size_t start = index;
        if (index >= code.length)
        {
            Token[] ret;
            foreach (_; 0 .. indentation.length)
                ret ~= Token(TokenType.detent, null, [start, index]);
            indentation.length = 0;
            ret ~= Token(TokenType.eof, null, [start, index]);
            return ret;
        }
        size_t afterFirst = index;
        auto c = decode(code, afterFirst);
        const size_t cLength = afterFirst - index;

        // skip start of file whitespace
        if (index == 0 && c.isWhite)
        {
            // bounded by code.length so that whitespace-only input does not decode past the end
            while (index < code.length)
            {
                size_t next = index;
                if (!decode(code, next).isWhite)
                    break;
                index = next;
            }
            return [];
        }

        if (c == '\r')
        {
            index++;
            if (index < code.length && code[index] == '\n')
                index++;
            return [Token(TokenType.newline, code[start .. index], [start, index])];
        }
        else if (c == '\n')
        {
            index++;
            return [Token(TokenType.newline, code[start .. index], [start, index])];
        }
        else if (c.isWhiteButNotNewline)
        {
            const isIndent = last.type == TokenType.newline;
            string data = code[start .. $];
            // set if the whitespace run is directly followed by a line break
            bool uselessWhitespace;
            while (data.length)
            {
                size_t len;
                c = data.decodeFront(len);
                if (c == '\r' || c == '\n')
                    uselessWhitespace = true;
                if (!c.isWhiteButNotNewline)
                    break;
                index += len;
            }
            assert(start != index);
            if (uselessWhitespace && lastWasNewline)
                return [];
            if (!isIndent)
                return [
                    Token(TokenType.whitespace, code[start .. index], [start, index])
                ];

            bool error;
            auto level = determineIndentation(code[start .. index], error);
            if (error)
                errors.error(this, start, "Mixing spaces and tabs indentation");
            Token[] ret;
            if (!indentation.length || indentation[$ - 1] < level)
            {
                ret ~= Token(TokenType.indent, code[start .. index], [start, index]);
                indentation ~= level;
            }
            else
            {
                // close every level that is deeper than the new one
                while (indentation.length && indentation[$ - 1] > level)
                {
                    ret ~= Token(TokenType.detent, code[start .. index], [start, index]);
                    indentation.length--;
                }
            }
            return ret;
        }
        else if (c.tagIdentifierValidator)
        {
            string data = code[start .. $];
            while (data.length)
            {
                size_t len;
                c = data.decodeFront(len);
                if (!c.tagIdentifierValidator)
                    break;
                index += len;
            }
            return [Token(TokenType.identifier, code[start .. index], [start, index])];
        }
        else
        {
            index += cLength;
            return [Token(TokenType.raw, code[start .. index], [start, index])];
        }
    }
}

/// Advances the input and returns true in case the tokens start with this match.
/// Warning: on match this will start a next token if it contains part of the match, discarding it basically.
/// On a mismatch the input is restored to the state it had before the call.
bool matchText(ref DietInput input, string match)
{
    auto past = input.save();
    size_t i = 0;
    while (i < match.length && !input.empty)
    {
        auto p = input.front;
        input.popFront();
        const remaining = match.length - i;
        if (remaining >= p.content.length)
        {
            // whole token must equal the next part of the match
            const end = i + p.content.length;
            if (match[i .. end] != p.content)
            {
                input = past;
                return false;
            }
            i = end;
        }
        else
        {
            // token is longer than what is left: only its start has to match
            if (match[i .. $] != p.content[0 .. remaining])
            {
                input = past;
                return false;
            }
            i = match.length;
        }
    }
    return i >= match.length;
}

/// True for whitespace characters other than `\n` and `\r`.
bool isWhiteButNotNewline(dchar c)
{
    return c.isWhite && c != '\n' && c != '\r';
}

/// True for characters allowed in tag identifiers: `[-:_0-9a-zA-Z]`
alias tagIdentifierValidator = (c) => c == '-' || c == ':' || c == '_' || c.isAlphaNum;
/// [-:_0-9a-zA-Z]+
bool validateTagIdentifier(string s)
{
    return s.byDchar.all!tagIdentifierValidator;
}

/// [-_0-9a-zA-Z]+
bool validateIdentifier(string s)
{
    return s.byDchar.all!(c => c == '-' || c == '_' || c.isAlphaNum);
}

/// [-_a-zA-Z] [-_0-9a-zA-Z]*
bool validateIdentifierAlpha(string s)
{
    bool first = true;
    foreach (c; s.byDchar)
    {
        // digits are allowed everywhere except at the first position
        const ok = c == '-' || c == '_' || c.isAlpha || (!first && c.isDigit);
        if (!ok)
            return false;
        first = false;
    }
    return true;
}

/// Prefixes every line of `s` that is not blank with `indentation`, keeping line terminators.
string indent(string s, string indentation = "\t")
{
    return s.lineSplitter!(KeepTerminator.yes)
        .map!(a => a.strip.length ? indentation ~ a : a)
        .join("");
}

unittest
{
    import std.array : array;

    DietInput input;
    input.file = "stdin";
    input.code = "doctype html\nhtml\n\n";

    auto tokens = input.array;
    assert(input.errors.length == 0);

    with (TokenType)
        assert(tokens == [
            Token(identifier, "doctype", [0, 7]), Token(whitespace, " ", [7, 8]),
            Token(identifier, "html", [8, 12]), Token(newline, "\n", [12, 13]),
            Token(identifier, "html", [13, 17]), Token(newline, "\n", [17, 18]),
            Token(newline, "\n", [18, 19])
        ]);
}