1 /// Simple source code splitter 2 /// Written by Vladimir Panteleev <vladimir@thecybershadow.net> 3 /// License: Boost Software License, Version 1.0 4 5 module splitter; 6 7 import std.ascii; 8 import std.algorithm; 9 import std.array; 10 import std.conv; 11 import std.exception; 12 import std.file; 13 import std.functional; 14 import std.path; 15 import std.range; 16 import std.string; 17 import std.traits; 18 import std.stdio : stderr; 19 20 import polyhash; 21 22 /// Represents an Entity's position within a program tree. 23 struct Address 24 { 25 Address* parent; /// Upper node's Address. If null, then this is the root node (and index should be 0). 26 size_t index; /// Index within the parent's children array 27 size_t depth; /// Distance from the root address 28 29 Address*[] children; /// Used to keep a global cached tree of addresses. 30 ref Address* child(size_t index) const 31 { 32 auto mutableThis = cast(Address*)&this; // Break const for caching 33 if (mutableThis.children.length < index + 1) 34 mutableThis.children.length = index + 1; 35 if (!mutableThis.children[index]) 36 mutableThis.children[index] = new Address(mutableThis, index, depth+1); 37 return mutableThis.children[index]; 38 } 39 } 40 41 struct EntityRef /// Reference to another Entity in the same tree 42 { 43 Entity entity; /// Pointer - only valid during splitting / optimizing 44 const(Address)* address; /// Address - assigned after splitting / optimizing 45 } 46 47 enum largest64bitPrime = 18446744073709551557UL; // 0xFFFFFFFF_FFFFFFC5 48 // static if (is(ModQ!(ulong, largest64bitPrime))) 49 static if (modQSupported) // https://issues.dlang.org/show_bug.cgi?id=20677 50 alias EntityHash = PolynomialHash!(ModQ!(ulong, largest64bitPrime)); 51 else 52 { 53 pragma(msg, 54 "64-bit long multiplication/division is not supported on this platform.\n" ~ 55 "Falling back to working in modulo 2^^64.\n" ~ 56 "Hashing / cache accuracy may be impaired.\n" ~ 57 "---------------------------------------------------------------------"); 58 alias EntityHash = PolynomialHash!ulong; 59 } 60 61 /// Represents a slice of the original code. 62 final class Entity 63 { 64 string head; /// This node's "head", e.g. "{" for a statement block. 65 Entity[] children; /// This node's children nodes, e.g. the statements of the statement block. 66 string tail; /// This node's "tail", e.g. "}" for a statement block. 67 68 string filename, contents; 69 @property bool isFile() { return filename != ""; } 70 71 bool isPair; /// Internal hint for --dump output 72 bool noRemove; /// Don't try removing this entity (children OK) 73 bool clean; /// Computed fields are up-to-date 74 75 bool dead; /// Tombstone or redirect 76 EntityRef[] dependents;/// If this entity is removed, so should all these entities. 77 Address* redirect; /// If moved, this is where this entity is now 78 79 int id; /// For diagnostics 80 size_t descendants; /// [Computed] For progress display 81 EntityHash hash; /// [Computed] Hashed value of this entity's content (as if it were saved to disk). 82 const(Address)*[] allDependents; /// [Computed] External dependents of this and child nodes 83 string deadContents; /// [Computed] For --white-out - all of this node's contents, with non-whitespace replaced by whitespace 84 EntityHash deadHash; /// [Computed] Hash of deadContents 85 86 this(string head = null, Entity[] children = null, string tail = null) 87 { 88 this.head = head; 89 this.children = children; 90 this.tail = tail; 91 } 92 93 @property string comment() 94 { 95 string[] result; 96 debug result = comments; 97 if (isPair) 98 { 99 assert(token == DSplitter.Token.none); 100 result ~= "Pair"; 101 } 102 if (token && DSplitter.tokenText[token]) 103 result ~= DSplitter.tokenText[token]; 104 return result.length ? result.join(" / ") : null; 105 } 106 107 override string toString() const 108 { 109 return "%(%s%) %s %(%s%)".format([head], children, [tail]); 110 } 111 112 Entity dup() /// Creates a shallow copy 113 { 114 auto result = new Entity; 115 foreach (i, item; this.tupleof) 116 result.tupleof[i] = this.tupleof[i]; 117 result.children = result.children.dup; 118 return result; 119 } 120 121 void kill() /// Convert to tombstone/redirect 122 { 123 dependents = null; 124 isPair = false; 125 descendants = 0; 126 allDependents = null; 127 dead = true; 128 } 129 130 private: // Used during parsing only 131 DSplitter.Token token; /// Used internally 132 133 debug string[] comments; /// Used to debug the splitter 134 } 135 136 enum Mode 137 { 138 source, 139 words, /// split identifiers, for obfuscation 140 } 141 142 enum Splitter 143 { 144 files, /// Load entire files only 145 lines, /// Split by line ends 146 words, /// Split by whitespace 147 D, /// Parse D source code 148 diff, /// Unified diffs 149 indent, /// Indentation (Python, YAML...) 150 } 151 immutable string[] splitterNames = [EnumMembers!Splitter].map!(e => e.text().toLower()).array(); 152 153 struct ParseRule 154 { 155 string pattern; 156 Splitter splitter; 157 } 158 159 struct ParseOptions 160 { 161 enum Mode { source, words } 162 163 bool stripComments; 164 ParseRule[] rules; 165 Mode mode; 166 uint tabWidth; 167 } 168 169 /// Parse the given file/directory. 170 /// For files, modifies path to be the base name for .test / .reduced directories. 171 Entity loadFiles(ref string path, ParseOptions options) 172 { 173 if (isFile(path)) 174 { 175 auto filePath = path; 176 path = stripExtension(path); 177 return loadFile(filePath.baseName(), filePath, options); 178 } 179 else 180 { 181 auto set = new Entity(); 182 foreach (string entry; dirEntries(path, SpanMode.breadth).array.sort!((a, b) => a.name < b.name)) 183 if (isFile(entry)) 184 { 185 assert(entry.startsWith(path)); 186 auto name = entry[path.length+1..$]; 187 set.children ~= loadFile(name, entry, options); 188 } 189 return set; 190 } 191 } 192 193 enum BIN_SIZE = 2; 194 195 void optimizeUntil(alias stop)(Entity set) 196 { 197 static Entity group(Entity[] children) 198 { 199 if (children.length == 1) 200 return children[0]; 201 auto e = new Entity(null, children, null); 202 e.noRemove = children.any!(c => c.noRemove)(); 203 return e; 204 } 205 206 static void clusterBy(ref Entity[] set, size_t binSize) 207 { 208 while (set.length > binSize) 209 { 210 auto size = set.length >= binSize*2 ? binSize : (set.length+1) / 2; 211 //auto size = binSize; 212 213 set = set.chunks(size).map!group.array; 214 } 215 } 216 217 void doOptimize(Entity e) 218 { 219 if (stop(e)) 220 return; 221 foreach (c; e.children) 222 doOptimize(c); 223 clusterBy(e.children, BIN_SIZE); 224 } 225 226 doOptimize(set); 227 } 228 229 alias optimize = optimizeUntil!((Entity e) => false); 230 231 private: 232 233 /// Override std.string nonsense, which does UTF-8 decoding 234 bool startsWith(in char[] big, in char[] small) { return big.length >= small.length && big[0..small.length] == small; } 235 bool startsWith(in char[] big, char c) { return big.length && big[0] == c; } 236 string strip(string s) { while (s.length && isWhite(s[0])) s = s[1..$]; while (s.length && isWhite(s[$-1])) s = s[0..$-1]; return s; } 237 238 immutable ParseRule[] defaultRules = 239 [ 240 { "*.d" , Splitter.D }, 241 { "*.di" , Splitter.D }, 242 { "*.diff" , Splitter.diff }, 243 { "*.patch", Splitter.diff }, 244 { "*" , Splitter.files }, 245 ]; 246 247 Entity loadFile(string name, string path, ParseOptions options) 248 { 249 stderr.writeln("Loading ", path); 250 auto result = new Entity(); 251 result.filename = name.replace(`\`, `/`); 252 result.contents = cast(string)read(path); 253 254 auto base = name.baseName(); 255 foreach (rule; chain(options.rules, defaultRules)) 256 if (base.globMatch(rule.pattern)) 257 { 258 final switch (rule.splitter) 259 { 260 case Splitter.files: 261 result.children = [new Entity(result.contents, null, null)]; 262 return result; 263 case Splitter.lines: 264 result.children = parseToLines(result.contents); 265 return result; 266 case Splitter.words: 267 result.children = parseToWords(result.contents); 268 return result; 269 case Splitter.D: 270 { 271 if (result.contents.startsWith("Ddoc")) 272 goto case Splitter.files; 273 274 DSplitter splitter; 275 if (options.stripComments) 276 result.contents = splitter.stripComments(result.contents); 277 278 final switch (options.mode) 279 { 280 case ParseOptions.Mode.source: 281 result.children = splitter.parse(result.contents); 282 return result; 283 case ParseOptions.Mode.words: 284 result.children = splitter.parseToWords(result.contents); 285 return result; 286 } 287 } 288 case Splitter.diff: 289 result.children = parseDiff(result.contents); 290 return result; 291 case Splitter.indent: 292 result.children = parseIndent(result.contents, options.tabWidth); 293 return result; 294 } 295 } 296 assert(false); // default * rule should match everything 297 } 298 299 // ***************************************************************************************************************************************************************************** 300 301 /// A simple, error-tolerant D source code splitter. 302 struct DSplitter 303 { 304 struct Pair { string start, end; } 305 static const Pair[] pairs = 306 [ 307 { "{", "}" }, 308 { "[", "]" }, 309 { "(", ")" }, 310 ]; 311 312 static immutable string[] blockKeywords = ["try", "catch", "finally", "while", "do", "in", "out", "body", "if", "static if", "else", "for", "foreach"]; 313 static immutable string[] parenKeywords = ["catch", "while", "if", "static if", "for", "foreach"]; 314 315 /// The order dictates the splitting priority of the separators. 316 static immutable string[][] separators = 317 [ 318 [";", "{"] ~ blockKeywords, 319 ["import"], 320 // From http://wiki.dlang.org/Operator_precedence 321 // Some items are listed twice, DustMite does not distinguish binary/prefix/postfix operators 322 [".."], 323 [","], 324 ["=>"], 325 ["=", "-=", "+=", "<<=", ">>=", ">>>=", "=", "*=", "%=", "^=", "^^=", "~="], 326 ["?", ":"], 327 ["||"], 328 ["&&"], 329 ["|"], 330 ["^"], 331 ["&"], 332 ["==", "!=", ">", "<", ">=", "<=", "!>", "!<", "!>=", "!<=", "<>", "!<>", "<>=", "!<>=", "in", "!in", "is", "!is"], 333 ["<<", ">>", ">>>"], 334 ["+", "-", "~"], 335 ["*", "/", "%"], 336 ["&", "++", "--", "*", "+", "-", /*"!",*/ "~"], 337 ["^^"], 338 [".", "++", "--" /*, "(", "["*/], 339 // "=>", 340 ["!"], 341 ["(", "["] 342 ]; 343 344 enum Token : int 345 { 346 none, 347 end, /// end of stream 348 whitespace, /// spaces, tabs, newlines 349 comment, /// all forms of comments 350 other, /// identifiers, literals and misc. keywords 351 352 generated0, /// First value of generated tokens (see below) 353 354 max = tokenText.length 355 } 356 357 static immutable string[] tokenText = 358 { 359 auto result = new string[Token.generated0]; 360 Token[string] lookup; 361 362 Token add(string s) 363 { 364 auto p = s in lookup; 365 if (p) 366 return *p; 367 368 Token t = cast(Token)result.length; 369 result ~= s; 370 lookup[s] = t; 371 return t; 372 } 373 374 foreach (pair; pairs) 375 { 376 add(pair.start); 377 add(pair.end ); 378 } 379 380 foreach (i, synonyms; separators) 381 foreach (sep; synonyms) 382 add(sep); 383 return result; 384 }(); 385 386 static Token lookupToken(string s) 387 { 388 if (!__ctfe) assert(false, "Don't use at runtime"); 389 foreach (t; Token.generated0 .. Token.max) 390 if (s == tokenText[t]) 391 return t; 392 assert(false, "No such token: " ~ s); 393 } 394 enum Token tokenLookup(string s) = lookupToken(s); 395 396 struct TokenPair { Token start, end; } 397 static TokenPair makeTokenPair(Pair pair) { return TokenPair(lookupToken(pair.start), lookupToken(pair.end)); } 398 alias lookupTokens = arrayMap!(lookupToken, const string); 399 static immutable TokenPair[] pairTokens = pairs .arrayMap!makeTokenPair(); 400 static immutable Token[][] separatorTokens = separators.arrayMap!lookupTokens (); 401 static immutable Token[] blockKeywordTokens = blockKeywords.arrayMap!lookupToken(); 402 static immutable Token[] parenKeywordTokens = parenKeywords.arrayMap!lookupToken(); 403 404 enum SeparatorType 405 { 406 none, 407 pair, 408 prefix, 409 postfix, 410 binary, /// infers dependency 411 } 412 413 static SeparatorType getSeparatorType(Token t) 414 { 415 switch (t) 416 { 417 case tokenLookup!";": 418 return SeparatorType.postfix; 419 case tokenLookup!"import": 420 return SeparatorType.prefix; 421 case tokenLookup!"else": 422 return SeparatorType.binary; 423 default: 424 if (pairTokens.any!(pair => pair.start == t)) 425 return SeparatorType.pair; 426 if (blockKeywordTokens.canFind(t)) 427 return SeparatorType.prefix; 428 if (separatorTokens.any!(row => row.canFind(t))) 429 return SeparatorType.binary; 430 return SeparatorType.none; 431 } 432 } 433 434 // ************************************************************************ 435 436 string s; /// Source code we are splitting 437 size_t i; /// Cursor 438 439 /// Making the end of an input stream a throwable greatly simplifies control flow. 440 class EndOfInput : Throwable { this() { super(null); } } 441 442 /// Are we at the end of the stream? 443 @property bool eos() { return i >= s.length; } 444 445 /// Advances i by n bytes. Throws EndOfInput if there are not enough. 446 string advance(size_t n) 447 { 448 auto e = i + n; 449 if (e > s.length) 450 { 451 i = s.length; 452 throw new EndOfInput; 453 } 454 auto result = s[i..e]; 455 i = e; 456 return result; 457 } 458 459 /// ditto 460 char advance() { return advance(1)[0]; } 461 462 /// If pre comes next, advance i through pre and return it. 463 /// Otherwise, return null. 464 string consume(string pre) 465 { 466 if (s[i..$].startsWith(pre)) 467 return advance(pre.length); 468 else 469 return null; 470 } 471 472 /// ditto 473 char consume(char pre) 474 { 475 assert(pre); 476 if (s[i..$].startsWith(pre)) 477 return advance(); 478 else 479 return 0; 480 } 481 482 /// Peeks at the next n characters. 483 string peek(size_t n) 484 { 485 if (i+n > s.length) 486 throw new EndOfInput; 487 return s[i..i+n]; 488 } 489 490 /// ditto 491 char peek() { return peek(1)[0]; } 492 493 /// Advances i through one token (whether it's a comment, 494 /// a series of whitespace characters, or something else). 495 /// Returns the token type. 496 Token skipTokenOrWS() 497 { 498 if (eos) 499 return Token.end; 500 501 Token result = Token.other; 502 try 503 { 504 auto c = advance(); 505 switch (c) 506 { 507 case '\'': 508 result = Token.other; 509 if (consume('\\')) 510 advance(); 511 while (advance() != '\'') 512 continue; 513 break; 514 case '\\': // D1 naked escaped string 515 result = Token.other; 516 advance(); 517 break; 518 case '"': 519 result = Token.other; 520 while (peek() != '"') 521 { 522 if (advance() == '\\') 523 advance(); 524 } 525 advance(); 526 break; 527 case 'r': 528 if (consume(`"`)) 529 { 530 result = Token.other; 531 while (advance() != '"') 532 continue; 533 break; 534 } 535 else 536 goto default; 537 case '`': 538 result = Token.other; 539 while (advance() != '`') 540 continue; 541 break; 542 case '/': 543 if (consume('/')) 544 { 545 result = Token.comment; 546 while (peek() != '\r' && peek() != '\n') 547 advance(); 548 } 549 else 550 if (consume('*')) 551 { 552 result = Token.comment; 553 while (!consume("*/")) 554 advance(); 555 } 556 else 557 if (consume('+')) 558 { 559 result = Token.comment; 560 int commentLevel = 1; 561 while (commentLevel) 562 { 563 if (consume("/+")) 564 commentLevel++; 565 else 566 if (consume("+/")) 567 commentLevel--; 568 else 569 advance(); 570 } 571 } 572 else 573 goto default; 574 break; 575 case '@': 576 if (consume("disable") 577 || consume("property") 578 || consume("safe") 579 || consume("trusted") 580 || consume("system") 581 ) 582 return Token.other; 583 goto default; 584 case '#': 585 result = Token.other; 586 do 587 { 588 c = advance(); 589 if (c == '\\') 590 c = advance(); 591 } 592 while (c != '\n'); 593 break; 594 default: 595 { 596 i--; 597 Token best; 598 size_t bestLength; 599 foreach (Token t; Token.init..Token.max) 600 { 601 auto text = tokenText[t]; 602 if (!text) 603 continue; 604 if (!s[i..$].startsWith(text)) 605 continue; 606 if (text[$-1].isAlphaNum() && s.length > i + text.length && s[i + text.length].isAlphaNum()) 607 continue; // if the token is a word, it must end at a word boundary 608 if (text.length <= bestLength) 609 continue; 610 best = t; 611 bestLength = text.length; 612 } 613 if (bestLength) 614 { 615 auto consumed = consume(tokenText[best]); 616 assert(consumed); 617 return best; 618 } 619 620 i++; 621 } 622 if (c.isWhite()) 623 { 624 result = Token.whitespace; 625 while (peek().isWhite()) 626 advance(); 627 } 628 else 629 if (isWordChar(c)) 630 { 631 result = Token.other; 632 while (isWordChar(peek())) 633 advance(); 634 } 635 else 636 result = Token.other; 637 } 638 } 639 catch (EndOfInput) 640 i = s.length; 641 return result; 642 } 643 644 /// Skips leading and trailing whitespace/comments, too. 645 /// Never returns Token.whitespace or Token.comment. 646 void readToken(out Token type, out string span) 647 { 648 size_t spanStart = i; 649 do 650 type = skipTokenOrWS(); 651 while (type == Token.whitespace || type == Token.comment); 652 skipToEOL(); 653 span = s[spanStart..i]; 654 if (type == Token.end && span.length) 655 type = Token.whitespace; 656 } 657 658 /// Moves i forward over first series of EOL characters, 659 /// or until first non-whitespace character, whichever comes first. 660 void skipToEOL() 661 { 662 try 663 while (true) 664 { 665 auto c = peek(); 666 if (c == '\r' || c == '\n') 667 { 668 do 669 advance(), c = peek(); 670 while (c == '\r' || c == '\n'); 671 return; 672 } 673 else 674 if (c.isWhite()) 675 i++; 676 else 677 if (peek(2) == "//") 678 { 679 auto t = skipTokenOrWS(); 680 assert(t == Token.comment); 681 } 682 else 683 break; 684 } 685 catch (EndOfInput) 686 i = s.length; 687 } 688 689 static bool isWordChar(char c) 690 { 691 return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_'; 692 } 693 694 void reset(string code) 695 { 696 s = code; 697 i = 0; 698 } 699 700 void parseScope(Entity result, Token scopeEnd) 701 { 702 enum Level : int 703 { 704 zero, 705 separator0, 706 separatorMax = separator0 + separators.length - 1, 707 text, 708 max 709 } 710 711 Entity[][Level.max] splitterQueue; 712 713 Entity[] terminateLevel(Level level) 714 { 715 level++; 716 if (level == Level.max) 717 return null; 718 719 auto r = splitterQueue[level] ~ group(terminateLevel(level)); 720 splitterQueue[level] = null; 721 return r; 722 } 723 724 while (true) 725 { 726 Token token; 727 string span; 728 readToken(token, span); 729 730 if (token == Token.end) 731 { 732 assert(span.empty); 733 break; 734 } 735 if (token == scopeEnd) 736 { 737 result.tail = span; 738 break; 739 } 740 741 auto e = new Entity(); 742 e.token = token; 743 auto level = Level.text; 744 745 Entity after; 746 747 foreach (n, synonyms; separatorTokens) 748 foreach (sep; synonyms) 749 if (token == sep) 750 { 751 level = cast(Level)(Level.separator0 + n); 752 e.children = terminateLevel(level); 753 auto type = getSeparatorType(token); 754 if (type == SeparatorType.prefix || type == SeparatorType.pair) 755 { 756 Entity empty = e; 757 if (e.children.length) 758 { 759 e.token = Token.none; 760 after = empty = new Entity(span); 761 after.token = token; 762 } 763 else 764 e.head = span; 765 766 if (type == SeparatorType.pair) 767 parseScope(empty, pairTokens.find!(pair => pair.start == token).front.end); 768 } 769 else 770 e.tail = span; 771 goto handled; 772 } 773 774 e.head = span; 775 776 handled: 777 splitterQueue[level] ~= e; 778 if (after) 779 splitterQueue[level] ~= after; 780 } 781 782 result.children ~= terminateLevel(Level.zero); 783 } 784 785 Entity[] parse(string code) 786 { 787 reset(code); 788 auto entity = new Entity; 789 parseScope(entity, Token.none); 790 assert(!entity.head && !entity.tail); 791 postProcess(entity.children); 792 return [entity]; 793 } 794 795 Entity[] parseToWords(string code) 796 { 797 reset(code); 798 Entity[] result; 799 while (!eos) 800 { 801 auto start = i; 802 auto token = skipTokenOrWS(); 803 auto span = s[start..i]; 804 805 if (token == Token.other) 806 result ~= new Entity(span); 807 else 808 { 809 if (!result.length) 810 result ~= new Entity(); 811 if (!result[$-1].tail) 812 result[$-1].tail = span; 813 else 814 { 815 start = result[$-1].tail.ptr - s.ptr; 816 result[$-1].tail = s[start..i]; 817 } 818 } 819 } 820 return result; 821 } 822 823 string stripComments(string code) 824 { 825 reset(code); 826 auto result = appender!string(); 827 while (!eos) 828 { 829 auto start = i; 830 Token t = skipTokenOrWS(); 831 if (t != Token.comment) 832 result.put(s[start..i]); 833 } 834 return result.data; 835 } 836 837 static Entity[] group(Entity[] entities) 838 { 839 if (entities.length <= 1) 840 return entities; 841 return [new Entity(null, entities, null)]; 842 } 843 844 static void postProcessSimplify(ref Entity[] entities) 845 { 846 for (size_t i=0; i<entities.length;) 847 { 848 auto e = entities[i]; 849 if (e.head.empty && e.tail.empty && e.dependents.empty) 850 { 851 assert(e.token == Token.none); 852 if (e.children.length == 0) 853 { 854 entities = entities.remove(i); 855 continue; 856 } 857 else 858 if (e.children.length == 1) 859 { 860 entities[i] = e.children[0]; 861 continue; 862 } 863 } 864 865 i++; 866 } 867 } 868 869 static void postProcessDependency(ref Entity[] entities) 870 { 871 if (entities.length < 2) 872 { 873 foreach (e; entities) 874 postProcessDependency(e.children); 875 return; 876 } 877 878 size_t[] points; 879 foreach_reverse (i, e; entities[0..$-1]) 880 if (getSeparatorType(e.token) == SeparatorType.binary && e.children) 881 points ~= i; 882 883 if (points.length) 884 { 885 auto i = points[$/2]; 886 auto e = entities[i]; 887 888 auto head = entities[0..i] ~ group(e.children); 889 e.children = null; 890 auto tail = new Entity(null, group(entities[i+1..$]), null); 891 tail.dependents ~= EntityRef(e); 892 entities = group(head ~ e) ~ tail; 893 foreach (c; entities) 894 postProcessDependency(c.children); 895 } 896 } 897 898 static void postProcessTemplates(ref Entity[] entities) 899 { 900 if (!entities.length) 901 return; 902 foreach_reverse (i, e; entities[0..$-1]) 903 if (e.token == tokenLookup!"!" && entities[i+1].children.length && entities[i+1].children[0].token == tokenLookup!"(") 904 { 905 auto dependency = new Entity; 906 dependency.dependents = [EntityRef(e), EntityRef(entities[i+1].children[0])]; 907 entities = entities[0..i+1] ~ dependency ~ entities[i+1..$]; 908 } 909 } 910 911 static void postProcessDependencyBlock(ref Entity[] entities) 912 { 913 foreach (i, e; entities) 914 if (i && !e.token && e.children.length && getSeparatorType(e.children[0].token) == SeparatorType.binary && !e.children[0].children) 915 entities[i-1].dependents ~= EntityRef(e.children[0]); 916 } 917 918 static void postProcessBlockKeywords(ref Entity[] entities) 919 { 920 foreach_reverse (i; 0 .. entities.length) 921 { 922 if (blockKeywordTokens.canFind(entities[i].token) && i+1 < entities.length) 923 { 924 auto j = i + 1; 925 if (j < entities.length && entities[j].token == tokenLookup!"(") 926 j++; 927 j++; // ; or { 928 if (j <= entities.length) 929 entities = entities[0..i] ~ group(group(entities[i..j-1]) ~ entities[j-1..j]) ~ entities[j..$]; 930 } 931 } 932 } 933 934 static void postProcessBlockStatements(ref Entity[] entities) 935 { 936 for (size_t i=0; i<entities.length;) 937 { 938 auto j = i; 939 bool consume(Token t) 940 { 941 if (j < entities.length 942 && entities[j].children.length == 2 943 && firstToken(entities[j].children[0]) == t) 944 { 945 j++; 946 return true; 947 } 948 return false; 949 } 950 951 if (consume(tokenLookup!"if") || consume(tokenLookup!"static if")) 952 consume(tokenLookup!"else"); 953 else 954 if (consume(tokenLookup!"do")) 955 consume(tokenLookup!"while"); 956 else 957 if (consume(tokenLookup!"try")) 958 { 959 while (consume(tokenLookup!"catch")) 960 continue; 961 consume(tokenLookup!"finally"); 962 } 963 964 if (i == j) 965 { 966 j++; 967 while (consume(tokenLookup!"in") || consume(tokenLookup!"out") || consume(tokenLookup!"body")) 968 continue; 969 } 970 971 if (j-i > 1) 972 { 973 entities = entities[0..i] ~ group(entities[i..j]) ~ entities[j..$]; 974 continue; 975 } 976 977 i++; 978 } 979 } 980 981 static void postProcessPairs(ref Entity[] entities) 982 { 983 size_t lastPair = 0; 984 985 for (size_t i=0; i<entities.length;) 986 { 987 // Create pair entities 988 989 if (entities[i].token == tokenLookup!"{") 990 { 991 if (i >= lastPair + 1) 992 { 993 lastPair = i-1; 994 entities = entities[0..lastPair] ~ group(group(entities[lastPair..i]) ~ entities[i]) ~ entities[i+1..$]; 995 i = lastPair; 996 entities[i].isPair = true; 997 lastPair++; 998 continue; 999 } 1000 else 1001 lastPair = i + 1; 1002 } 1003 else 1004 if (entities[i].token == tokenLookup!";") 1005 lastPair = i + 1; 1006 1007 i++; 1008 } 1009 } 1010 1011 static void postProcessParens(ref Entity[] entities) 1012 { 1013 for (size_t i=0; i+1 < entities.length;) 1014 { 1015 if (parenKeywordTokens.canFind(entities[i].token)) 1016 { 1017 auto pparen = firstHead(entities[i+1]); 1018 if (pparen 1019 && *pparen !is entities[i+1] 1020 && pparen.token == tokenLookup!"(") 1021 { 1022 auto paren = *pparen; 1023 *pparen = new Entity(); 1024 entities = entities[0..i] ~ group([entities[i], paren]) ~ entities[i+1..$]; 1025 continue; 1026 } 1027 } 1028 1029 i++; 1030 } 1031 1032 foreach (e; entities) 1033 postProcessParens(e.children); 1034 } 1035 1036 static bool isValidIdentifier(string s) 1037 { 1038 if (!s.length) 1039 return false; 1040 if (!isAlpha(s[0])) 1041 return false; 1042 foreach (c; s[1..$]) 1043 if (!isAlphaNum(c)) 1044 return false; 1045 return true; 1046 } 1047 1048 /// Get all nodes between (exclusively) two addresses. 1049 /// If either address is empty, then the respective bound is the respective extreme. 1050 static Entity[] nodesBetween(Entity root, size_t[] a, size_t[] b) 1051 { 1052 while (a.length && b.length && a[0] == b[0]) 1053 { 1054 root = root.children[a[0]]; 1055 a = a[1..$]; 1056 b = b[1..$]; 1057 } 1058 size_t index0, index1; 1059 Entity[] children0, children1; 1060 if (a.length) 1061 { 1062 index0 = a[0] + 1; 1063 if (a.length > 1) 1064 children0 = nodesBetween(root.children[a[0]], a[1..$], null); 1065 } 1066 else 1067 index0 = 0; 1068 1069 if (b.length) 1070 { 1071 index1 = b[0]; 1072 if (b.length > 1) 1073 children1 = nodesBetween(root.children[b[0]], null, b[1..$]); 1074 } 1075 else 1076 index1 = root.children.length; 1077 1078 assert(index0 <= index1); 1079 return children0 ~ root.children[index0 .. index1] ~ children1; 1080 } 1081 1082 static void postProcessRecursive(ref Entity[] entities) 1083 { 1084 foreach (e; entities) 1085 if (e.children.length) 1086 postProcessRecursive(e.children); 1087 1088 postProcessSimplify(entities); 1089 postProcessTemplates(entities); 1090 postProcessDependency(entities); 1091 postProcessBlockKeywords(entities); 1092 postProcessDependencyBlock(entities); 1093 postProcessBlockStatements(entities); 1094 postProcessPairs(entities); 1095 postProcessParens(entities); 1096 } 1097 1098 /// Attempt to link together function arguments / parameters for 1099 /// things that look like calls to the same function, to allow removing 1100 /// unused function arguments / parameters. 1101 static void postProcessArgs(ref Entity[] entities) 1102 { 1103 string lastID; 1104 1105 Entity[][][string] calls; 1106 1107 void visit(Entity entity) 1108 { 1109 auto id = entity.head.strip(); 1110 if (entity.token == Token.other && isValidIdentifier(id) && !entity.tail && !entity.children) 1111 lastID = id; 1112 else 1113 if (lastID && entity.token == tokenLookup!"(") 1114 { 1115 size_t[] stack; 1116 struct Comma { size_t[] addr, after; } 1117 Comma[] commas; 1118 1119 bool afterComma; 1120 1121 // Find all top-level commas 1122 void visit2(size_t i, Entity entity) 1123 { 1124 stack ~= i; 1125 if (afterComma) 1126 { 1127 commas[$-1].after = stack; 1128 //entity.comments ~= "After-comma %d".format(commas.length); 1129 afterComma = false; 1130 } 1131 1132 if (entity.token == tokenLookup!",") 1133 { 1134 commas ~= Comma(stack); 1135 //entity.comments ~= "Comma %d".format(commas.length); 1136 afterComma = true; 1137 } 1138 else 1139 if (entity.head.length || entity.tail.length) 1140 {} 1141 else 1142 foreach (j, child; entity.children) 1143 visit2(j, child); 1144 stack = stack[0..$-1]; 1145 } 1146 1147 foreach (i, child; entity.children) 1148 visit2(i, child); 1149 1150 // Find all nodes between commas, effectively obtaining the arguments 1151 size_t[] last = null; 1152 commas ~= [Comma()]; 1153 Entity[][] args; 1154 foreach (i, comma; commas) 1155 { 1156 //Entity entityAt(Entity root, size_t[] address) { return address.length ? entityAt(root.children[address[0]], address[1..$]) : root; } 1157 //entityAt(entity, last).comments ~= "nodesBetween-left %d".format(i); 1158 //entityAt(entity, comma.after).comments ~= "nodesBetween-right %d".format(i); 1159 args ~= nodesBetween(entity, last, comma.after); 1160 last = comma.addr; 1161 } 1162 1163 // Register the arguments 1164 foreach (i, arg; args) 1165 { 1166 debug 1167 foreach (j, e; arg) 1168 e.comments ~= "%s arg %d node %d".format(lastID, i, j); 1169 1170 if (arg.length == 1) 1171 { 1172 if (lastID !in calls) 1173 calls[lastID] = null; 1174 while (calls[lastID].length < i+1) 1175 calls[lastID] ~= null; 1176 calls[lastID][i] ~= arg[0]; 1177 } 1178 } 1179 1180 lastID = null; 1181 return; 1182 } 1183 else 1184 if (entity.token == tokenLookup!"!") 1185 {} 1186 else 1187 if (entity.head || entity.tail) 1188 lastID = null; 1189 1190 foreach (child; entity.children) 1191 visit(child); 1192 } 1193 1194 foreach (entity; entities) 1195 visit(entity); 1196 1197 // For each parameter, create a dummy empty node which is a dependency for all of the arguments. 1198 auto callRoot = new Entity(); 1199 debug callRoot.comments ~= "Args root"; 1200 entities ~= callRoot; 1201 1202 foreach (id; calls.keys.sort()) 1203 { 1204 auto funRoot = new Entity(); 1205 debug funRoot.comments ~= "%s root".format(id); 1206 callRoot.children ~= funRoot; 1207 1208 foreach (i, args; calls[id]) 1209 { 1210 auto e = new Entity(); 1211 debug e.comments ~= "%s param %d".format(id, i); 1212 funRoot.children ~= e; 1213 foreach (arg; args) 1214 e.dependents ~= EntityRef(arg); 1215 } 1216 } 1217 } 1218 1219 static void postProcess(ref Entity[] entities) 1220 { 1221 postProcessRecursive(entities); 1222 postProcessArgs(entities); 1223 } 1224 1225 static Entity* firstHead(ref return Entity e) 1226 { 1227 if (e.head.length) 1228 return &e; 1229 foreach (ref c; e.children) 1230 { 1231 auto r = firstHead(c); 1232 if (r) 1233 return r; 1234 } 1235 return null; 1236 } 1237 1238 static Token firstToken(Entity e) 1239 { 1240 while (!e.token && e.children.length) 1241 e = e.children[0]; 1242 return e.token; 1243 } 1244 } 1245 1246 public: 1247 1248 /// Split the text into sequences for each fun is always true, and then always false 1249 Entity[] parseSplit(alias fun)(string text) 1250 { 1251 Entity[] result; 1252 size_t i, wordStart, wordEnd; 1253 for (i = 1; i <= text.length; i++) 1254 if (i==text.length || (fun(text[i-1]) && !fun(text[i]))) 1255 { 1256 if (wordStart != i) 1257 result ~= new Entity(text[wordStart..wordEnd], null, text[wordEnd..i]); 1258 wordStart = wordEnd = i; 1259 } 1260 else 1261 if ((!fun(text[i-1]) && fun(text[i]))) 1262 wordEnd = i; 1263 return result; 1264 } 1265 1266 alias parseToWords = parseSplit!isNotAlphaNum; 1267 alias parseToLines = parseSplit!isNewline; 1268 1269 /// Split s on end~start, preserving end and start on each chunk 1270 private string[] split2(string end, string start)(string s) 1271 { 1272 enum sep = end ~ start; 1273 return split2Impl(s, sep, end.length); 1274 } 1275 1276 private string[] split2Impl(string s, string sep, size_t endLength) 1277 { 1278 string[] result; 1279 while (true) 1280 { 1281 auto i = s.indexOf(sep); 1282 if (i < 0) 1283 return result ~ s; 1284 i += endLength; 1285 result ~= s[0..i]; 1286 s = s[i..$]; 1287 } 1288 } 1289 1290 unittest 1291 { 1292 assert(split2!("]", "[")(null) == [""]); 1293 assert(split2!("]", "[")("[foo]") == ["[foo]"]); 1294 assert(split2!("]", "[")("[foo][bar]") == ["[foo]", "[bar]"]); 1295 assert(split2!("]", "[")("[foo] [bar]") == ["[foo] [bar]"]); 1296 } 1297 1298 Entity[] parseDiff(string s) 1299 { 1300 return s 1301 .split2!("\n", "diff ") 1302 .map!( 1303 (string file) 1304 { 1305 auto chunks = file.split2!("\n", "@@ "); 1306 return new Entity(chunks[0], chunks[1..$].map!(chunk => new Entity(chunk)).array); 1307 } 1308 ) 1309 .array 1310 ; 1311 } 1312 1313 Entity[] parseIndent(string s, uint tabWidth) 1314 { 1315 Entity[] root; 1316 Entity[]*[] stack; 1317 1318 foreach (line; s.split2!("\n", "")) 1319 { 1320 size_t indent = 0; 1321 charLoop: 1322 foreach (c; line) 1323 switch (c) 1324 { 1325 case ' ': 1326 indent++; 1327 break; 1328 case '\t': 1329 indent += tabWidth; 1330 break; 1331 case '\r': 1332 case '\n': 1333 // Treat empty (whitespace-only) lines as belonging to the 1334 // immediately higher (most-nested) block. 1335 indent = stack.length; 1336 break charLoop; 1337 default: 1338 break charLoop; 1339 } 1340 1341 auto e = new Entity(line); 1342 foreach_reverse (i; 0 .. min(indent, stack.length)) // non-inclusively up to indent 1343 if (stack[i]) 1344 { 1345 *stack[i] ~= e; 1346 goto parentFound; 1347 } 1348 root ~= e; 1349 parentFound: 1350 stack.length = indent + 1; 1351 stack[indent] = &e.children; 1352 } 1353 1354 return root; 1355 } 1356 1357 private: 1358 1359 bool isNewline(char c) { return c == '\r' || c == '\n'; } 1360 alias isNotAlphaNum = not!isAlphaNum; 1361 1362 // https://d.puremagic.com/issues/show_bug.cgi?id=11824 1363 auto arrayMap(alias fun, T)(T[] arr) 1364 { 1365 alias R = typeof(fun(arr[0])); 1366 auto result = new R[arr.length]; 1367 foreach (i, v; arr) 1368 result[i] = fun(v); 1369 return result; 1370 }