/// Simple source code splitter
/// Written by Vladimir Panteleev <vladimir@thecybershadow.net>
/// Released into the Public Domain

module splitter;

import std.ascii;
import std.algorithm;
import std.array;
import std.conv;
import std.exception;
import std.file;
import std.functional;
import std.path;
import std.range;
import std.string;
import std.traits;
import std.stdio : stderr;

/// Represents a slice of the original code.
class Entity
{
	string head;           /// This node's "head", e.g. "{" for a statement block.
	Entity[] children;     /// This node's children nodes, e.g. the statements of the statement block.
	string tail;           /// This node's "tail", e.g. "}" for a statement block.

	string filename, contents;
	@property bool isFile() { return filename != ""; }

	bool isPair;           /// Internal hint for --dump output
	bool noRemove;         /// Don't try removing this entity (children OK)

	bool removed;          /// For dangling dependencies
	Entity[] dependencies; /// If any of these entities are omitted, so should this entity.

	int id;                /// For diagnostics
	size_t descendants;    /// For progress display

	DSplitter.Token token; /// Used internally

	this(string head = null, Entity[] children = null, string tail = null)
	{
		this.head     = head;
		this.children = children;
		this.tail     = tail;
	}

	string[] comments;

	@property string comment()
	{
		string[] result = comments;
		if (isPair)
		{
			assert(token == DSplitter.Token.none);
			result ~= "Pair";
		}
		if (token && DSplitter.tokenText[token])
			result ~= DSplitter.tokenText[token];
		return result.length ? result.join(" / ") : null;
	}

	override string toString()
	{
		return "%(%s%) %s %(%s%)".format([head], children, [tail]);
	}
}

enum Mode
{
	source,
	words,     /// split identifiers, for obfuscation
}

enum Splitter
{
	files,     /// Load entire files only
	lines,     /// Split by line ends
	words,     /// Split by whitespace
	D,         /// Parse D source code
	diff,      /// Unified diffs
}
immutable string[] splitterNames = [EnumMembers!Splitter].map!(e => e.text().toLower()).array();

struct ParseRule
{
	string pattern;
	Splitter splitter;
}

struct ParseOptions
{
	enum Mode { source, words }

	bool stripComments;
	ParseRule[] rules;
	Mode mode;
}

/// Parse the given file/directory.
/// For files, modifies path to be the base name for .test / .reduced directories.
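/// Returns the root entity of the tree (for directories, one child per file).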
Entity loadFiles(ref string path, ParseOptions options)
{
	if (isFile(path))
	{
		auto filePath = path;
		path = stripExtension(path);
		return loadFile(filePath.baseName(), filePath, options);
	}
	else
	{
		auto set = new Entity();
		foreach (string entry; dirEntries(path, SpanMode.breadth).array.sort!((a, b) => a.name < b.name))
			if (isFile(entry))
			{
				assert(entry.startsWith(path));
				auto name = entry[path.length+1..$];
				set.children ~= loadFile(name, entry, options);
			}
		return set;
	}
}

enum BIN_SIZE = 2;

void optimize(Entity set)
{
	static void group(ref Entity[] set, size_t start, size_t end)
	{
		//set = set[0..start] ~ [new Entity(removable, set[start..end])] ~ set[end..$];
		auto children = set[start..end].dup;
		auto e = new Entity(null, children, null);
		e.noRemove = children.any!(c => c.noRemove)();
		set.replaceInPlace(start, end, [e]);
	}

	static void clusterBy(ref Entity[] set, size_t binSize)
	{
		while (set.length > binSize)
		{
			auto size = set.length >= binSize*2 ? binSize : (set.length+1) / 2;
			//auto size = binSize;

			auto bins = set.length/size;
			if (set.length % size > 1)
				group(set, bins*size, set.length);
			foreach_reverse (i; 0..bins)
				group(set, i*size, (i+1)*size);
		}
	}

	static void doOptimize(Entity e)
	{
		foreach (c; e.children)
			doOptimize(c);
		clusterBy(e.children, BIN_SIZE);
	}

	doOptimize(set);
}

private:

/// Override std.string nonsense, which does UTF-8 decoding
bool startsWith(in char[] big, in char[] small) { return big.length >= small.length && big[0..small.length] == small; }
bool startsWith(in char[] big, char c) { return big.length && big[0] == c; }
string strip(string s) { while (s.length && isWhite(s[0])) s = s[1..$]; while (s.length && isWhite(s[$-1])) s = s[0..$-1]; return s; }

immutable ParseRule[] defaultRules =
[
	{ "*.d"    , Splitter.D     },
	{ "*.di"   , Splitter.D     },
	{ "*.diff" , Splitter.diff  },
	{ "*.patch", Splitter.diff  },
	{ "*"      , Splitter.files },
];

Entity loadFile(string name, string path, ParseOptions options)
{
	stderr.writeln("Loading ", path);
	auto result = new Entity();
	result.filename = name.replace(`\`, `/`);
	result.contents = cast(string)read(path);

	auto base = name.baseName();
	foreach (rule; chain(options.rules, defaultRules))
		if (base.globMatch(rule.pattern))
		{
			final switch (rule.splitter)
			{
				case Splitter.files:
					result.children = [new Entity(result.contents, null, null)];
					return result;
				case Splitter.lines:
					result.children = parseToLines(result.contents);
					return result;
				case Splitter.words:
					result.children = parseToWords(result.contents);
					return result;
				case Splitter.D:
				{
					if (result.contents.startsWith("Ddoc"))
						goto case Splitter.files;

					DSplitter splitter;
					if (options.stripComments)
						result.contents = splitter.stripComments(result.contents);

					final switch (options.mode)
					{
						case ParseOptions.Mode.source:
							result.children = splitter.parse(result.contents);
							return result;
						case ParseOptions.Mode.words:
							result.children = splitter.parseToWords(result.contents);
							return result;
					}
				}
				case Splitter.diff:
					result.children = parseDiff(result.contents);
					return result;
			}
		}
	assert(false); // default * rule should match everything
}

// ****************************************************************************

/// A simple, error-tolerant D source code splitter.
struct DSplitter
{
	struct Pair { string start, end; }
	static const Pair[] pairs =
	[
		{ "{", "}" },
		{ "[", "]" },
		{ "(", ")" },
	];

	static immutable string[] blockKeywords = ["try", "catch", "finally", "while", "do", "in", "out", "body", "if", "static if", "else", "for", "foreach"];
	static immutable string[] parenKeywords = ["catch", "while", "if", "static if", "for", "foreach"];

	/// The order dictates the splitting priority of the separators.
	static immutable string[][] separators =
	[
		[";", "{"] ~ blockKeywords,
		["import"],
		// From http://wiki.dlang.org/Operator_precedence
		// Some items are listed twice, DustMite does not distinguish binary/prefix/postfix operators
		[".."],
		[","],
		["=>"],
		["=", "-=", "+=", "<<=", ">>=", ">>>=", "=", "*=", "/=", "%=", "^=", "^^=", "~="],
		["?", ":"],
		["||"],
		["&&"],
		["|"],
		["^"],
		["&"],
		["==", "!=", ">", "<", ">=", "<=", "!>", "!<", "!>=", "!<=", "<>", "!<>", "<>=", "!<>=", "in", "!in", "is", "!is"],
		["<<", ">>", ">>>"],
		["+", "-", "~"],
		["*", "/", "%"],
		["&", "++", "--", "*", "+", "-", /*"!",*/ "~"],
		["^^"],
		[".", "++", "--" /*, "(", "["*/],
		// "=>",
		["!"],
		["(", "["]
	];

	enum Token : int
	{
		none,
		end,        /// end of stream
		whitespace, /// spaces, tabs, newlines
		comment,    /// all forms of comments
		other,      /// identifiers, literals and misc. keywords

		generated0, /// First value of generated tokens (see below)

		max = generated0 + tokenLookup.length
	}

	enum Token[string] tokenLookup = // DMD pr/2824
	{
		Token[string] lookup;

		auto t = Token.generated0;
		Token add(string s)
		{
			auto p = s in lookup;
			if (p)
				return *p;
			return lookup[s] = t++;
		}

		foreach (pair; pairs)
		{
			add(pair.start);
			add(pair.end  );
		}

		foreach (i, synonyms; separators)
			foreach (sep; synonyms)
				add(sep);

		return lookup;
	}();

	static immutable string[Token.max] tokenText =
	{
		string[Token.max] result;
		foreach (k, v; tokenLookup)
			result[v] = k;
		return result;
	}();

	struct TokenPair { Token start, end; }
	static Token lookupToken(string s) { return tokenLookup[s]; }
	static TokenPair makeTokenPair(Pair pair) { return TokenPair(tokenLookup[pair.start], tokenLookup[pair.end]); }
	alias lookupTokens = arrayMap!(lookupToken, const string);
	static immutable TokenPair[] pairTokens        = pairs        .arrayMap!makeTokenPair();
	static immutable Token[][]   separatorTokens   = separators   .arrayMap!lookupTokens ();
	static immutable Token[]     blockKeywordTokens = blockKeywords.arrayMap!lookupToken();
	static immutable Token[]     parenKeywordTokens = parenKeywords.arrayMap!lookupToken();

	enum SeparatorType
	{
		none,
		pair,
		prefix,
		postfix,
		binary, /// infers dependency
	}

	static SeparatorType getSeparatorType(Token t)
	{
		switch (t)
		{
			case tokenLookup[";"]:
				return SeparatorType.postfix;
			case tokenLookup["import"]:
				return SeparatorType.prefix;
			case tokenLookup["else"]:
				return SeparatorType.binary;
			default:
				if (pairTokens.any!(pair => pair.start == t))
					return SeparatorType.pair;
				if (blockKeywordTokens.canFind(t))
					return SeparatorType.prefix;
				if (separatorTokens.any!(row => row.canFind(t)))
					return SeparatorType.binary;
				return SeparatorType.none;
		}
	}

	// ************************************************************************

	string s; /// Source code we are splitting
	size_t i; /// Cursor

	/// Making the end of an input stream a throwable greatly simplifies control flow.
	class EndOfInput : Throwable { this() { super(null); } }

	/// Are we at the end of the stream?
	@property bool eos() { return i >= s.length; }

	/// Advances i by n bytes. Throws EndOfInput if there are not enough.
	string advance(size_t n)
	{
		auto e = i + n;
		if (e > s.length)
		{
			i = s.length;
			throw new EndOfInput;
		}
		auto result = s[i..e];
		i = e;
		return result;
	}

	/// ditto
	char advance() { return advance(1)[0]; }

	/// If pre comes next, advance i through pre and return it.
	/// Otherwise, return null.
	string consume(string pre)
	{
		if (s[i..$].startsWith(pre))
			return advance(pre.length);
		else
			return null;
	}

	/// ditto
	char consume(char pre)
	{
		assert(pre);
		if (s[i..$].startsWith(pre))
			return advance();
		else
			return 0;
	}

	/// Peeks at the next n characters.
	string peek(size_t n)
	{
		if (i+n > s.length)
			throw new EndOfInput;
		return s[i..i+n];
	}

	/// ditto
	char peek() { return peek(1)[0]; }

	/// Advances i through one token (whether it's a comment,
	/// a series of whitespace characters, or something else).
	/// Returns the token type.
	Token skipTokenOrWS()
	{
		if (eos)
			return Token.end;

		Token result = Token.other;
		try
		{
			auto c = advance();
			switch (c)
			{
				case '\'':
					result = Token.other;
					if (consume('\\'))
						advance();
					while (advance() != '\'')
						continue;
					break;
				case '\\': // D1 naked escaped string
					result = Token.other;
					advance();
					break;
				case '"':
					result = Token.other;
					while (peek() != '"')
					{
						if (advance() == '\\')
							advance();
					}
					advance();
					break;
				case 'r':
					if (consume(`"`)) // the 'r' itself was already consumed above
					{
						result = Token.other;
						while (advance() != '"')
							continue;
						break;
					}
					else
						goto default;
				case '`':
					result = Token.other;
					while (advance() != '`')
						continue;
					break;
				case '/':
					if (consume('/'))
					{
						result = Token.comment;
						while (peek() != '\r' && peek() != '\n')
							advance();
					}
					else
					if (consume('*'))
					{
						result = Token.comment;
						while (!consume("*/"))
							advance();
					}
					else
					if (consume('+'))
					{
						result = Token.comment;
						int commentLevel = 1;
						while (commentLevel)
						{
							if (consume("/+"))
								commentLevel++;
							else
							if (consume("+/"))
								commentLevel--;
							else
								advance();
						}
					}
					else
						goto default;
					break;
				case '@':
					if (consume("disable")
					 || consume("property")
					 || consume("safe")
					 || consume("trusted")
					 || consume("system")
					)
						return Token.other;
					goto default;
				case '#':
					result = Token.other;
					do
					{
						c = advance();
						if (c == '\\')
							c = advance();
					}
					while (c != '\n');
					break;
				default:
				{
					i--;
					Token best;
					size_t bestLength;
					foreach (Token t; Token.init..Token.max)
					{
						auto text = tokenText[t];
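						// (tokenText only has entries for the generated
						// separator/pair tokens; other kinds have no text.)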
						if (!text)
							continue;
						if (!s[i..$].startsWith(text))
							continue;
						if (text[$-1].isAlphaNum() && s.length > i + text.length && s[i + text.length].isAlphaNum())
							continue; // if the token is a word, it must end at a word boundary
						if (text.length <= bestLength)
							continue;
						best = t;
						bestLength = text.length;
					}
					if (bestLength)
					{
						auto consumed = consume(tokenText[best]);
						assert(consumed);
						return best;
					}

					i++;
				}
				if (c.isWhite())
				{
					result = Token.whitespace;
					while (peek().isWhite())
						advance();
				}
				else
				if (isWordChar(c))
				{
					result = Token.other;
					while (isWordChar(peek()))
						advance();
				}
				else
					result = Token.other;
			}
		}
		catch (EndOfInput)
			i = s.length;
		return result;
	}

	/// Skips leading and trailing whitespace/comments, too.
	/// Never returns Token.whitespace or Token.comment.
	void readToken(out Token type, out string span)
	{
		size_t spanStart = i;
		do
			type = skipTokenOrWS();
		while (type == Token.whitespace || type == Token.comment);
		skipToEOL();
		span = s[spanStart..i];
		if (type == Token.end && span.length)
			type = Token.whitespace;
	}

	/// Moves i forward over first series of EOL characters,
	/// or until first non-whitespace character, whichever comes first.
	void skipToEOL()
	{
		try
			while (true)
			{
				auto c = peek();
				if (c == '\r' || c == '\n')
				{
					do
						advance(), c = peek();
					while (c == '\r' || c == '\n');
					return;
				}
				else
				if (c.isWhite())
					i++;
				else
				if (peek(2) == "//")
				{
					auto t = skipTokenOrWS();
					assert(t == Token.comment);
				}
				else
					break;
			}
		catch (EndOfInput)
			i = s.length;
	}

	static bool isWordChar(char c)
	{
		return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_';
	}

	void reset(string code)
	{
		s = code;
		i = 0;
	}

	void parseScope(Entity result, Token scopeEnd)
	{
		enum Level : int
		{
			zero,
			separator0,
			separatorMax = separator0 + separators.length - 1,
			text,
			max
		}

		Entity[][Level.max] splitterQueue;

		Entity[] terminateLevel(Level level)
		{
			level++;
			if (level == Level.max)
				return null;

			auto r = splitterQueue[level] ~ group(terminateLevel(level));
			splitterQueue[level] = null;
			return r;
		}

	tokenLoop:
		while (true)
		{
			Token token;
			string span;
			readToken(token, span);

			if (token == Token.end)
			{
				assert(span.empty);
				break;
			}
			if (token == scopeEnd)
			{
				result.tail = span;
				break;
			}

			auto e = new Entity();
			e.token = token;
			auto level = Level.text;

			Entity after;

			foreach (n, synonyms; separatorTokens)
				foreach (sep; synonyms)
					if (token == sep)
					{
						level = cast(Level)(Level.separator0 + n);
						e.children = terminateLevel(level);
						auto type = getSeparatorType(token);
						if (type == SeparatorType.prefix || type == SeparatorType.pair)
						{
							Entity empty = e;
							if (e.children.length)
							{
								e.token = Token.none;
								after = empty = new Entity(span);
								after.token = token;
							}
							else
								e.head = span;

							if (type == SeparatorType.pair)
								parseScope(empty, pairTokens.find!(pair => pair.start == token).front.end);
						}
						else
							e.tail = span;
						goto handled;
					}

			e.head = span;

		handled:
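			// All tokens converge here: separators via the goto above,
			// plain text by falling through. Queue the entity at its level.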
			splitterQueue[level] ~= e;
			if (after)
				splitterQueue[level] ~= after;
		}

		result.children ~= terminateLevel(Level.zero);
	}

	Entity[] parse(string code)
	{
		reset(code);
		auto entity = new Entity;
		parseScope(entity, Token.none);
		assert(!entity.head && !entity.tail);
		postProcess(entity.children);
		return [entity];
	}

	Entity[] parseToWords(string code)
	{
		reset(code);
		Entity[] result;
		while (!eos)
		{
			auto start = i;
			auto token = skipTokenOrWS();
			auto span = s[start..i];

			if (token == Token.other)
				result ~= new Entity(span);
			else
			{
				if (!result.length)
					result ~= new Entity();
				if (!result[$-1].tail)
					result[$-1].tail = span;
				else
				{
					start = result[$-1].tail.ptr - s.ptr;
					result[$-1].tail = s[start..i];
				}
			}
		}
		return result;
	}

	string stripComments(string code)
	{
		reset(code);
		auto result = appender!string();
		while (!eos)
		{
			auto start = i;
			Token t = skipTokenOrWS();
			if (t != Token.comment)
				result.put(s[start..i]);
		}
		return result.data;
	}

	static Entity[] group(Entity[] entities)
	{
		if (entities.length <= 1)
			return entities;
		return [new Entity(null, entities, null)];
	}

	static void postProcessSimplify(ref Entity[] entities)
	{
		for (size_t i=0; i<entities.length;)
		{
			auto e = entities[i];
			if (e.head.empty && e.tail.empty && e.dependencies.empty)
			{
				assert(e.token == Token.none);
				if (e.children.length == 0)
				{
					entities = entities.remove(i);
					continue;
				}
				else
				if (e.children.length == 1)
				{
					entities[i] = e.children[0];
					continue;
				}
			}

			i++;
		}
	}

	static void postProcessDependency(ref Entity[] entities)
	{
		if (entities.length < 2)
		{
			foreach (e; entities)
				postProcessDependency(e.children);
			return;
		}

		size_t[] points;
		foreach_reverse (i, e; entities[0..$-1])
			if (getSeparatorType(e.token) == SeparatorType.binary && e.children)
				points ~= i;

		if (points.length)
		{
			auto i = points[$/2];
			auto e = entities[i];

			auto head = entities[0..i] ~ group(e.children);
			e.children = null;
			auto tail = new Entity(null, group(entities[i+1..$]), null);
			e.dependencies ~= tail;
			entities = group(head ~ e) ~ tail;
			foreach (c; entities)
				postProcessDependency(c.children);
		}
	}

	static void postProcessTemplates(ref Entity[] entities)
	{
		if (!entities.length)
			return;
		foreach_reverse (i, e; entities[0..$-1])
			if (e.token == tokenLookup["!"] && entities[i+1].children.length && entities[i+1].children[0].token == tokenLookup["("])
			{
				auto dependency = new Entity;
				e.dependencies ~= dependency;
				entities[i+1].children[0].dependencies ~= dependency;
				entities = entities[0..i+1] ~ dependency ~ entities[i+1..$];
			}
	}

	static void postProcessDependencyBlock(ref Entity[] entities)
	{
		foreach (i, e; entities)
			if (i && !e.token && e.children.length && getSeparatorType(e.children[0].token) == SeparatorType.binary && !e.children[0].children)
				e.children[0].dependencies ~= entities[i-1];
	}

	static void postProcessBlockKeywords(ref Entity[] entities)
	{
		for (size_t i=0; i<entities.length;)
		{
			if (blockKeywordTokens.canFind(entities[i].token) && i+1 < entities.length)
			{
				auto j = i + 1;
				if (j < entities.length && entities[j].token == tokenLookup["("])
					j++;
				j++; // ; or {
				if (j <= entities.length)
				{
					entities = entities[0..i] ~ group(group(entities[i..j-1]) ~ entities[j-1..j]) ~ entities[j..$];
					continue;
				}
			}

			i++;
		}
	}

	static void postProcessBlockStatements(ref Entity[] entities)
	{
		for (size_t i=0; i<entities.length;)
		{
			auto j = i;
			bool consume(Token t)
			{
				if (j < entities.length
				 && entities[j].children.length == 2
				 && firstToken(entities[j].children[0]) == t)
				{
					j++;
					return true;
				}
				return false;
			}

			if (consume(tokenLookup["if"]) || consume(tokenLookup["static if"]))
				consume(tokenLookup["else"]);
			else
			if (consume(tokenLookup["do"]))
				consume(tokenLookup["while"]);
			else
			if (consume(tokenLookup["try"]))
			{
				while (consume(tokenLookup["catch"]))
					continue;
				consume(tokenLookup["finally"]);
			}

			if (i == j)
			{
				j++;
				while (consume(tokenLookup["in"]) || consume(tokenLookup["out"]) || consume(tokenLookup["body"]))
					continue;
			}

			if (j-i > 1)
			{
				entities = entities[0..i] ~ group(entities[i..j]) ~ entities[j..$];
				continue;
			}

			i++;
		}
	}

	static void postProcessPairs(ref Entity[] entities)
	{
		size_t lastPair = 0;

		for (size_t i=0; i<entities.length;)
		{
			// Create pair entities

			if (entities[i].token == tokenLookup["{"])
			{
				if (i >= lastPair + 1)
				{
					lastPair = i-1;
					entities = entities[0..lastPair] ~ group(group(entities[lastPair..i]) ~ entities[i]) ~ entities[i+1..$];
					i = lastPair;
					entities[i].isPair = true;
					lastPair++;
					continue;
				}
				else
					lastPair = i + 1;
			}
			else
			if (entities[i].token == tokenLookup[";"])
				lastPair = i + 1;

			i++;
		}
	}

	static void postProcessParens(ref Entity[] entities)
	{
		for (size_t i=0; i+1 < entities.length;)
		{
			if (parenKeywordTokens.canFind(entities[i].token))
			{
				auto pparen = firstHead(entities[i+1]);
				if (pparen
				 && *pparen !is entities[i+1]
				 && pparen.token == tokenLookup["("])
				{
					auto paren = *pparen;
					*pparen = new Entity();
					entities = entities[0..i] ~ group([entities[i], paren]) ~ entities[i+1..$];
					continue;
				}
			}

			i++;
		}

		foreach (e; entities)
			postProcessParens(e.children);
	}

	static bool isValidIdentifier(string s)
	{
		if (!s.length)
			return false;
		if (!isAlpha(s[0]))
			return false;
		foreach (c; s[1..$])
			if (!isAlphaNum(c))
				return false;
		return true;
	}

	/// Get all nodes between (exclusively) two addresses.
	/// If either address is empty, then the respective bound is the respective extreme.
	static Entity[] nodesBetween(Entity root, size_t[] a, size_t[] b)
	{
		while (a.length && b.length && a[0] == b[0])
		{
			root = root.children[a[0]];
			a = a[1..$];
			b = b[1..$];
		}
		size_t index0, index1;
		Entity[] children0, children1;
		if (a.length)
		{
			index0 = a[0] + 1;
			if (a.length > 1)
				children0 = nodesBetween(root.children[a[0]], a[1..$], null);
		}
		else
			index0 = 0;

		if (b.length)
		{
			index1 = b[0];
			if (b.length > 1)
				children1 = nodesBetween(root.children[b[0]], null, b[1..$]);
		}
		else
			index1 = root.children.length;

		assert(index0 <= index1);
		return children0 ~ root.children[index0 .. index1] ~ children1;
	}

	static void postProcessRecursive(ref Entity[] entities)
	{
		foreach (e; entities)
			if (e.children.length)
				postProcessRecursive(e.children);

		postProcessSimplify(entities);
		postProcessTemplates(entities);
		postProcessDependency(entities);
		postProcessBlockKeywords(entities);
		postProcessDependencyBlock(entities);
		postProcessBlockStatements(entities);
		postProcessPairs(entities);
		postProcessParens(entities);
	}

	/// Attempt to link together function arguments / parameters for
	/// things that look like calls to the same function, to allow removing
	/// unused function arguments / parameters.
	static void postProcessArgs(ref Entity[] entities)
	{
		string lastID;

		Entity[][][string] calls;

		void visit(Entity entity)
		{
			auto id = entity.head.strip();
			if (entity.token == Token.other && isValidIdentifier(id) && !entity.tail && !entity.children)
				lastID = id;
			else
			if (lastID && entity.token == tokenLookup["("])
			{
				size_t[] stack;
				struct Comma { size_t[] addr, after; }
				Comma[] commas;

				bool afterComma;

				// Find all top-level commas
				void visit2(size_t i, Entity entity)
				{
					stack ~= i;
					if (afterComma)
					{
						commas[$-1].after = stack;
						//entity.comments ~= "After-comma %d".format(commas.length);
						afterComma = false;
					}

					if (entity.token == tokenLookup[","])
					{
						commas ~= Comma(stack);
						//entity.comments ~= "Comma %d".format(commas.length);
						afterComma = true;
					}
					else
					if (entity.head.length || entity.tail.length)
						{}
					else
						foreach (j, child; entity.children)
							visit2(j, child);
					stack = stack[0..$-1];
				}

				foreach (i, child; entity.children)
					visit2(i, child);

				// Find all nodes between commas, effectively obtaining the arguments
				size_t[] last = null;
				commas ~= [Comma()];
				Entity[][] args;
				foreach (i, comma; commas)
				{
					//Entity entityAt(Entity root, size_t[] address) { return address.length ? entityAt(root.children[address[0]], address[1..$]) : root; }
					//entityAt(entity, last).comments ~= "nodesBetween-left %d".format(i);
					//entityAt(entity, comma.after).comments ~= "nodesBetween-right %d".format(i);
					args ~= nodesBetween(entity, last, comma.after);
					last = comma.addr;
				}

				// Register the arguments
				foreach (i, arg; args)
				{
					debug
						foreach (j, e; arg)
							e.comments ~= "%s arg %d node %d".format(lastID, i, j);

					if (arg.length == 1)
					{
						if (lastID !in calls)
							calls[lastID] = null;
						while (calls[lastID].length < i+1)
							calls[lastID] ~= null;
						calls[lastID][i] ~= arg[0];
					}
				}

				lastID = null;
				return;
			}
			else
			if (entity.token == tokenLookup["!"])
				{}
			else
			if (entity.head || entity.tail)
				lastID = null;

			foreach (child; entity.children)
				visit(child);
		}

		foreach (entity; entities)
			visit(entity);

		// For each parameter, create a dummy empty node which is a dependency for all of the arguments.
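		// Since arguments depend on their parameter node, omitting that node
		// lets the corresponding argument at every call site be removed together.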
		auto callRoot = new Entity();
		debug callRoot.comments ~= "Args root";
		entities ~= callRoot;

		foreach (id, params; calls)
		{
			auto funRoot = new Entity();
			debug funRoot.comments ~= "%s root".format(id);
			callRoot.children ~= funRoot;

			foreach (i, args; params)
			{
				auto e = new Entity();
				debug e.comments ~= "%s param %d".format(id, i);
				funRoot.children ~= e;
				foreach (arg; args)
					arg.dependencies ~= e;
			}
		}
	}

	static void postProcess(ref Entity[] entities)
	{
		postProcessRecursive(entities);
		postProcessArgs(entities);
	}

	static Entity* firstHead(ref Entity e)
	{
		if (e.head.length)
			return &e;
		foreach (ref c; e.children)
		{
			auto r = firstHead(c);
			if (r)
				return r;
		}
		return null;
	}

	static Token firstToken(Entity e)
	{
		while (!e.token && e.children.length)
			e = e.children[0];
		return e.token;
	}
}

public:

/// Split the text into chunks: each chunk's head is a maximal run where
/// fun is false, and its tail is the following run where fun is true.
Entity[] parseSplit(alias fun)(string text)
{
	Entity[] result;
	size_t i, wordStart, wordEnd;
	for (i = 1; i <= text.length; i++)
		if (i==text.length || (fun(text[i-1]) && !fun(text[i])))
		{
			if (wordStart != i)
				result ~= new Entity(text[wordStart..wordEnd], null, text[wordEnd..i]);
			wordStart = wordEnd = i;
		}
		else
		if ((!fun(text[i-1]) && fun(text[i])))
			wordEnd = i;
	return result;
}

alias parseToWords = parseSplit!isNotAlphaNum;
alias parseToLines = parseSplit!isNewline;

/// Split s on end~start, preserving end and start on each chunk
private string[] split2(string end, string start)(string s)
{
	enum sep = end ~ start;
	return split2Impl(s, sep, end.length);
}

private string[] split2Impl(string s, string sep, size_t endLength)
{
	string[] result;
	while (true)
	{
		auto i = s.indexOf(sep);
		if (i < 0)
			return result ~ s;
		i += endLength;
		result ~= s[0..i];
		s = s[i..$];
	}
}

unittest
{
	assert(split2!("]", "[")(null) == [""]);
	assert(split2!("]", "[")("[foo]") == ["[foo]"]);
	assert(split2!("]", "[")("[foo][bar]") == ["[foo]", "[bar]"]);
	assert(split2!("]", "[")("[foo] [bar]") == ["[foo] [bar]"]);
}

Entity[] parseDiff(string s)
{
	return s
		.split2!("\n", "diff ")
		.map!(
			(string file)
			{
				auto chunks = file.split2!("\n", "@@ ");
				return new Entity(chunks[0], chunks[1..$].map!(chunk => new Entity(chunk)).array);
			}
		)
		.array
	;
}

private:

bool isNewline(char c) { return c == '\r' || c == '\n'; }
alias isNotAlphaNum = not!isAlphaNum;

// https://d.puremagic.com/issues/show_bug.cgi?id=11824
auto arrayMap(alias fun, T)(T[] arr)
{
	alias R = typeof(fun(arr[0]));
	auto result = new R[arr.length];
	foreach (i, v; arr)
		result[i] = fun(v);
	return result;
}
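// Illustrative tests (editor's addition, not part of the original module):
// minimal sketches of the splitting behavior of the helpers above.
unittest
{
	// parseToLines: each chunk's head is the line text, its tail the EOL run.
	auto lines = parseToLines("a\nb\n");
	assert(lines.length == 2);
	assert(lines[0].head == "a" && lines[0].tail == "\n");
	assert(lines[1].head == "b" && lines[1].tail == "\n");
}

unittest
{
	// parseDiff: one entity per file (header as head), one child per hunk.
	auto files = parseDiff("diff --git a/f b/f\n--- a/f\n+++ b/f\n@@ -1 +1 @@\n-x\n+y\n");
	assert(files.length == 1);
	assert(files[0].children.length == 1);
}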