module dietc.lexer;

import std.algorithm;
import std.ascii;
import std.conv;
import std.string;
import std.utf;

/// Kinds of tokens handled by `DietInput`.
enum TokenType
{
	/// A single character that is neither whitespace nor part of an identifier.
	raw,
	/// Leading whitespace of a line that is deeper than the current indentation level.
	indent,
	/// One closed indentation level.
	detent,
	/// A run of characters accepted by `tagIdentifierValidator`.
	identifier,
	/// Not produced by the lexer in this module.
	code,
	/// A `\r`, `\n` or `\r\n` line break.
	newline,
	/// Whitespace that is not line-leading indentation.
	whitespace,
	/// End of input.
	eof
}

/// A single lexed token.
struct Token
{
	/// Kind of the token.
	TokenType type;
	/// Slice of the source text covered by this token (may be null for synthetic tokens).
	string content;
	/// Byte offsets `[start, end]` into the source code; `end` is exclusive.
	size_t[2] range;
}

/// Collects errors encountered while lexing/parsing a `DietInput`.
struct ErrorContext
{
	/// A single recorded error.
	struct Error
	{
		/// File name of the input the error was reported for.
		string file;
		/// Byte offset into the input's code at which the error occurred.
		size_t at;
		/// Fully formatted message, including the `file(line:column): ` location prefix.
		string message;
	}

	/// All errors recorded so far; `ErrorContext` can be used like this array.
	Error[] errors;
	alias errors this;

	/// Formats `s` as `file(line:column): s`, using 1-based line and column numbers
	/// computed from the byte offset `at` inside `input.code`.
	string formatMessage(ref DietInput input, size_t at, string s)
	{
		auto pos = input.code.bytesToPosition(at);
		return input.file ~ "(" ~ to!string(pos[0] + 1) ~ ":" ~ to!string(pos[1] + 1) ~ "): " ~ s;
	}

	/// Records the error message `s` for byte offset `at` of `input`.
	void error(ref DietInput input, size_t at, string s)
	{
		errors ~= Error(input.file, at, formatMessage(input, at, s));
	}

	/// Records an "Expected ..." error for byte offset `at` of `input`.
	///
	/// If an "Expected ..." error was already recorded for the same file and
	/// offset, `expectation` is appended to that error (separated by `", "`)
	/// instead of adding a new entry.
	///
	/// `srcfile` and `srcline` identify the calling source location and are only
	/// included in the message in debug builds.
	void expect(ref DietInput input, size_t at, string expectation,
			string srcfile = __FILE__, size_t srcline = __LINE__)
	{
		// Stored messages carry the location prefix (and in debug builds a
		// "(src=...)" prefix before it), so look for the located text instead
		// of testing whether the message starts with "Expected ".
		immutable located = formatMessage(input, at, "Expected ");
		foreach (ref existing; errors)
		{
			if (existing.file == input.file && existing.at == at
					&& existing.message.canFind(located))
			{
				existing.message ~= ", " ~ expectation;
				return;
			}
		}
		string prefix;
		debug prefix = "(src=" ~ srcfile ~ ":" ~ srcline.to!string ~ ") ";
		errors ~= Error(input.file, at, prefix ~ formatMessage(input, at, "Expected " ~ expectation));
	}
}

/// zero-based [line, column]
alias Position = size_t[2];

/// Converts a byte offset into `code` to a zero-based `[line, column]` pair.
///
/// Line breaks are `\n`, `\r\n` and a lone `\r`. Columns count decoded code
/// points since the start of the line. A code point that starts before `bytes`
/// is counted in full, even when `bytes` points into the middle of it.
Position bytesToPosition(string code, size_t bytes)
{
	size_t line, column;
	bool wasCR;
	size_t i;
	while (code.length)
	{
		if (i >= bytes)
			break;
		size_t len;
		immutable c = code.decodeFront(len);
		i += len;
		if (c == '\r')
		{
			// A pending CR followed by another CR is a line break of its own.
			if (wasCR)
			{
				line++;
				column = 0;
			}
			wasCR = true;
		}
		else if (c == '\n')
		{
			// Covers both a lone LF and the LF of a CRLF pair.
			line++;
			column = 0;
			wasCR = false;
		}
		else
		{
			if (wasCR)
			{
				// Lone CR line break before this character.
				line++;
				column = 0;
			}
			wasCR = false;
			column++;
		}
	}
	if (wasCR)
	{
		// The offset ends directly after a CR.
		line++;
		column = 0;
	}
	return [line, column];
}

/// Which whitespace character a document uses for indentation.
enum IndentStyle
{
	/// Not determined yet.
	unknown,
	/// Indentation uses non-tab whitespace.
	spaces,
	/// Indentation uses tab characters.
	tabs
}

/// Token range over the source text of a document.
///
/// Tokens are produced lazily by `front` and buffered in `backlog` in reverse
/// order, so the current token is always `backlog[$ - 1]`.
struct DietInput
{
	/// Name of the file, used in error messages.
	string file;
	/// Complete source text.
	string code;
	/// Byte offset of the next unlexed character in `code`.
	size_t index;
	/// Column width a tab advances to when computing indentation levels.
	int tabSize = 4;
	/// Stack of currently open indentation levels (in columns).
	size_t[] indentation;
	/// Indentation style found so far.
	IndentStyle indentationStyle;
	/// Last token returned from `front` out of the backlog.
	Token last;
	/// Buffered tokens in reverse order; the current token is the last element.
	Token[] backlog;
	/// Whether the token most recently popped from the backlog was a newline.
	bool lastWasNewline = true;

	/// Creates an input by reading the text of `file`.
	static DietInput fromFile(R)(R file)
	{
		import std.file : readText;

		DietInput ret;
		ret.code = readText(file);
		ret.file = file.to!string;
		return ret;
	}

	/// Errors collected while processing this input.
	ErrorContext errors;

	/// Computes the column width of the all-whitespace string `whitespace`.
	///
	/// A tab advances to the next multiple of `tabSize`; every other character
	/// counts as one column. The first call fixes `indentationStyle`, and
	/// `error` is set to true when a character contradicts that style.
	size_t determineIndentation(string whitespace, out bool error)
	{
		assert(whitespace.byDchar.all!isWhite);
		size_t indentation; // @suppress(dscanner.suspicious.label_var_same_name)
		foreach (c; whitespace.byDchar)
		{
			if (c == '\t')
			{
				if (indentationStyle == IndentStyle.unknown)
					indentationStyle = IndentStyle.tabs;
				else if (indentationStyle == IndentStyle.spaces)
					error = true;
				indentation = (indentation / tabSize + 1) * tabSize;
			}
			else
			{
				if (indentationStyle == IndentStyle.unknown)
					indentationStyle = IndentStyle.spaces;
				else if (indentationStyle == IndentStyle.tabs)
					error = true;
				indentation++;
			}
		}
		return indentation;
	}

	/// Returns the source text between byte offsets `range[0]` and `range[1]`.
	///
	/// Returns null for a reversed range. Both offsets are clamped to the
	/// length of `code`, so an out-of-bounds range gives a shortened or empty
	/// slice instead of a range violation.
	string read(size_t[2] range)
	{
		if (range[1] < range[0])
			return null;
		immutable end = min(range[1], code.length);
		immutable start = min(range[0], end);
		return code[start .. end];
	}

	/// Rewinds the input to the start and clears all lexer state.
	/// Recorded errors are kept.
	void reset()
	{
		index = 0;
		indentation.length = 0;
		backlog.length = 0;
		last = Token.init;
		indentationStyle = IndentStyle.unknown;
		lastWasNewline = true;
	}

	/// Checks if the current token matches type & optional match.
	bool peek(TokenType type, string match = null)
	{
		auto t = front();
		if (t.type != type)
			return false;
		if (match !is null)
			return t.content == match;
		return !empty || type == TokenType.eof;
	}

	/// Does a peek and advances a token.
	bool match(TokenType type, string match = null)
	{
		auto ret = peek(type, match);
		popFront();
		return ret;
	}

	/// Does a peek and advances a token and adds an error if it doesn't match.
	bool expect(TokenType type, string match = null, string srcfile = __FILE__,
			size_t srcline = __LINE__)
	{
		size_t at = index;
		Token tok = front;
		auto ret = this.match(type, match);
		if (!ret)
			errors.expect(this, at, (match.length ? "'" ~ match ~ "'"
					: type.to!string) ~ ", but got " ~ tok.to!string, srcfile, srcline);
		return ret;
	}

	/// Skips tokens while their type is one of `types`.
	/// Returns: the number of skipped tokens.
	size_t skipAll(TokenType[] types...)
	{
		size_t n;
		while (types.canFind(front.type))
		{
			popFront();
			n++;
		}
		return n;
	}

	/// Skips tokens while their type is one of `types`.
	/// Returns: how many tokens of each type were skipped, in the order of `types`.
	size_t[] skipAllCount(TokenType[] types...)
	{
		size_t[] n = new size_t[](types.length);
		while (true)
		{
			auto i = types.countUntil(front.type);
			if (i < 0)
				break;
			popFront();
			n[i]++;
		}
		return n;
	}

	/// Returns a copy of this input that shares no mutable arrays with it.
	auto save()
	{
		auto copy = this;
		copy.indentation = indentation.dup;
		copy.backlog = backlog.dup;
		copy.errors = errors.dup;
		return copy;
	}

	/// Drops the current token.
	///
	/// Once the end of the code is reached and the backlog is empty, each call
	/// advances `index` past the end.
	/// Throws: `Exception` after more than 100 such calls, to guard against
	/// callers looping at end of input.
	void popFront()
	{
		if (backlog.length)
		{
			lastWasNewline = backlog[$ - 1].type == TokenType.newline;
			backlog.length--;
		}
		else if (index >= code.length)
		{
			index++;
			if (index > code.length + 100)
				throw new Exception("Attempted to read past EOF too often");
		}
	}

	/// True once all code is consumed, no tokens are buffered and every
	/// indentation level has been closed.
	bool empty() @property
	{
		return index >= code.length && backlog.length == 0 && indentation.length == 0;
	}

	/// Returns the current token, lexing more input if nothing is buffered.
	///
	/// At end of input with open indentation levels, a newline followed by one
	/// detent per open level is produced first. After that an eof token is returned.
	auto front() @property
	{
		if (index >= code.length && backlog.length == 0 && indentation.length > 0)
		{
			backlog.length = indentation.length + 1;
			backlog[0] = Token(TokenType.newline, null, [index, index]);
			foreach (i; 0 .. indentation.length)
				backlog[i + 1] = Token(TokenType.detent, null, [index, index]);
			// The backlog is consumed from the back, so the newline comes out first.
			backlog.reverse();
			indentation.length = 0;
		}
		if (index >= code.length && backlog.length == 0)
			return Token(TokenType.eof, null, [index, index]);
		else
		{
			// parse() may return no tokens (skipped whitespace), so repeat until
			// something is available.
			while (!backlog.length)
			{
				backlog = parse();
				backlog.reverse();
			}
			return last = backlog[$ - 1];
		}
	}

	/// Lexes the next piece of input at `index` and returns its tokens in
	/// source order. The result is empty when only ignorable whitespace was consumed.
	private Token[] parse()
	{
		const size_t start = index;
		if (index >= code.length)
		{
			// End of input: close every open indentation level, then emit eof.
			Token[] ret;
			foreach_reverse (indent; indentation)
			{
				ret ~= Token(TokenType.detent, null, [start, index]);
				indentation.length--;
			}
			ret ~= Token(TokenType.eof, null, [start, index]);
			return ret;
		}
		size_t dummy = index;
		auto c = decode(code, dummy);
		const size_t cLength = dummy - index;

		// skip start of file whitespace
		if (index == 0 && c.isWhite)
		{
			// Stop in front of the first non-whitespace character, or at the end
			// of the code when the whole input is whitespace.
			while (index < code.length)
			{
				size_t next = index;
				if (!decode(code, next).isWhite)
					break;
				index = next;
			}
			return [];
		}

		if (c == '\r')
		{
			index++;
			// Treat CRLF as a single newline token.
			if (index < code.length && code[index] == '\n')
				index++;
			return [Token(TokenType.newline, code[start .. index], [start, index])];
		}
		else if (c == '\n')
		{
			index++;
			return [Token(TokenType.newline, code[start .. index], [start, index])];
		}
		else if (c.isWhiteButNotNewline)
		{
			// Whitespace directly after a newline token is indentation.
			const isIndent = last.type == TokenType.newline;
			string data = code[start .. $];
			// Set when the whitespace run is directly followed by a line break.
			bool uselessWhitespace;
			while (data.length)
			{
				size_t len;
				c = data.decodeFront(len);
				if (c == '\r' || c == '\n')
					uselessWhitespace = true;
				if (!c.isWhiteButNotNewline)
					break;
				index += len;
			}
			assert(start != index);
			// A whitespace-only line produces no tokens.
			if (uselessWhitespace && lastWasNewline)
				return [];
			if (isIndent)
			{
				bool error;
				auto level = determineIndentation(code[start .. index], error);
				if (error)
					errors.error(this, start, "Mixing spaces and tabs indentation");
				Token[] ret;
				if (!indentation.length || indentation[$ - 1] < level)
				{
					ret ~= Token(TokenType.indent, code[start .. index], [start, index]);
					indentation ~= level;
				}
				else
				{
					// Close every level that is deeper than the new one.
					foreach_reverse (indent; indentation)
					{
						if (indent <= level)
							break;
						ret ~= Token(TokenType.detent, code[start .. index], [start, index]);
						indentation.length--;
					}
				}
				return ret;
			}
			else
				return [Token(TokenType.whitespace, code[start .. index], [start, index])];
		}
		else if (c.tagIdentifierValidator)
		{
			string data = code[start .. $];
			while (data.length)
			{
				size_t len;
				c = data.decodeFront(len);
				if (!c.tagIdentifierValidator)
					break;
				index += len;
			}
			return [Token(TokenType.identifier, code[start .. index], [start, index])];
		}
		else
		{
			index += cLength;
			return [Token(TokenType.raw, code[start .. index], [start, index])];
		}
	}
}

/// Advances the input and returns true in case the tokens start with this match.
/// Warning: on match this will start a next token if it contains part of the match, discarding it basically.
/// On a mismatch the input is restored to the state it had before the call.
bool matchText(ref DietInput input, string match)
{
	auto past = input.save();
	size_t i = 0;
	while (i < match.length && !input.empty)
	{
		auto p = input.front;
		input.popFront();
		if (match.length - i >= p.content.length)
		{
			// The whole token must equal the next part of the match.
			immutable end = i + p.content.length;
			if (match[i .. end] != p.content)
			{
				input = past;
				return false;
			}
			i = end;
		}
		else
		{
			// The token is longer than the rest of the match: compare its prefix only.
			if (match[i .. $] != p.content[0 .. match.length - i])
			{
				input = past;
				return false;
			}
			i = match.length;
		}
	}
	return i >= match.length;
}

/// True for whitespace characters other than `\n` and `\r`.
bool isWhiteButNotNewline(dchar c)
{
	return c.isWhite && c != '\n' && c != '\r';
}

/// Accepts a single character of the set `[-:_0-9a-zA-Z]`.
alias tagIdentifierValidator = (c) => c == '-' || c == ':' || c == '_' || c.isAlphaNum;
/// [-:_0-9a-zA-Z]+
bool validateTagIdentifier(string s)
{
	return s.byDchar.all!tagIdentifierValidator;
}

/// [-_0-9a-zA-Z]+
bool validateIdentifier(string s)
{
	return s.byDchar.all!(c => c == '-' || c == '_' || c.isAlphaNum);
}

/// [-_a-zA-Z] [-_0-9a-zA-Z]*
bool validateIdentifierAlpha(string s)
{
	bool first = true;
	return s.byDchar.all!((c) {
		// Digits are allowed everywhere except in the first position.
		auto ret = c == '-' || c == '_' || c.isAlpha || (!first && c.isDigit);
		first = false;
		return ret;
	});
}

/// Prefixes every line of `s` that is not blank with `indentation`,
/// keeping the original line terminators.
string indent(string s, string indentation = "\t")
{
	return s.lineSplitter!(KeepTerminator.yes)
		.map!(a => a.strip.length ? indentation ~ a : a)
		.join("");
}