splitter source code

1 /// Simple source code splitter
2 /// Written by Vladimir Panteleev <vladimir@thecybershadow.net>
3 /// License: Boost Software License, Version 1.0
4 
5 module splitter;
6 
7 import std.ascii;
8 import std.algorithm;
9 import std.array;
10 import std.conv;
11 import std.exception;
12 import std.file;
13 import std.functional;
14 import std.path;
15 import std.range;
16 import std.string;
17 import std.traits;
18 import std.stdio : stderr;
19 
20 import polyhash;
21 
22 /// Represents an Entity's position within a program tree.
23 struct Address
24 {
25 	Address* parent;       /// Upper node's Address. If null, then this is the root node (and index should be 0).
26 	size_t index;          /// Index within the parent's children array
27 	size_t depth;          /// Distance from the root address
28 
29 	Address*[] children;   /// Used to keep a global cached tree of addresses.
30 	ref Address* child(size_t index) const
31 	{
32 		auto mutableThis = cast(Address*)&this; // Break const for caching
33 		if (mutableThis.children.length < index + 1)
34 			mutableThis.children.length = index + 1;
35 		if (!mutableThis.children[index])
36 			mutableThis.children[index] = new Address(mutableThis, index, depth+1);
37 		return mutableThis.children[index];
38 	}
39 }
40 
41 struct EntityRef             /// Reference to another Entity in the same tree
42 {
43 	Entity entity;           /// Pointer - only valid during splitting / optimizing
44 	const(Address)* address; /// Address - assigned after splitting / optimizing
45 }
46 
47 enum largest64bitPrime = 18446744073709551557UL; // 0xFFFFFFFF_FFFFFFC5
48 // static if (is(ModQ!(ulong, largest64bitPrime)))
49 static if (modQSupported) // https://issues.dlang.org/show_bug.cgi?id=20677
50 	alias EntityHash = PolynomialHash!(ModQ!(ulong, largest64bitPrime));
51 else
52 {
53 	pragma(msg,
54 		"64-bit long multiplication/division is not supported on this platform.\n" ~
55 		"Falling back to working in modulo 2^^64.\n" ~
56 		"Hashing / cache accuracy may be impaired.\n" ~
57 		"---------------------------------------------------------------------");
58 	alias EntityHash = PolynomialHash!ulong;
59 }
60 
61 /// Represents a slice of the original code.
62 final class Entity
63 {
64 	string head;           /// This node's "head", e.g. "{" for a statement block.
65 	Entity[] children;     /// This node's children nodes, e.g. the statements of the statement block.
66 	string tail;           /// This node's "tail", e.g. "}" for a statement block.
67 
68 	string filename, contents;
69 	@property bool isFile() { return filename != ""; }
70 
71 	bool isPair;           /// Internal hint for --dump output
72 	bool noRemove;         /// Don't try removing this entity (children OK)
73 	bool clean;            /// Computed fields are up-to-date
74 
75 	bool dead;             /// Tombstone or redirect
76 	EntityRef[] dependents;/// If this entity is removed, so should all these entities.
77 	Address* redirect;     /// If moved, this is where this entity is now
78 
79 	int id;                /// For diagnostics
80 	size_t descendants;    /// [Computed] For progress display
81 	EntityHash hash;       /// [Computed] Hashed value of this entity's content (as if it were saved to disk).
82 	const(Address)*[] allDependents; /// [Computed] External dependents of this and child nodes
83 	string deadContents;   /// [Computed] For --white-out - all of this node's contents, with non-whitespace replaced by whitespace
84 	EntityHash deadHash;   /// [Computed] Hash of deadContents
85 
86 	this(string head = null, Entity[] children = null, string tail = null)
87 	{
88 		this.head     = head;
89 		this.children = children;
90 		this.tail     = tail;
91 	}
92 
93 	@property string comment()
94 	{
95 		string[] result;
96 		debug result = comments;
97 		if (isPair)
98 		{
99 			assert(token == DSplitter.Token.none);
100 			result ~= "Pair";
101 		}
102 		if (token && DSplitter.tokenText[token])
103 			result ~= DSplitter.tokenText[token];
104 		return result.length ? result.join(" / ") : null;
105 	}
106 
107 	override string toString() const
108 	{
109 		return "%(%s%) %s %(%s%)".format([head], children, [tail]);
110 	}
111 
112 	Entity dup()           /// Creates a shallow copy
113 	{
114 		auto result = new Entity;
115 		foreach (i, item; this.tupleof)
116 			result.tupleof[i] = this.tupleof[i];
117 		result.children = result.children.dup;
118 		return result;
119 	}
120 
121 	void kill()            /// Convert to tombstone/redirect
122 	{
123 		dependents = null;
124 		isPair = false;
125 		descendants = 0;
126 		allDependents = null;
127 		dead = true;
128 	}
129 
130 private: // Used during parsing only
131 	DSplitter.Token token;    /// Used internally
132 
133 	debug string[] comments;  /// Used to debug the splitter
134 }
135 
136 enum Mode
137 {
138 	source,
139 	words,     /// split identifiers, for obfuscation
140 }
141 
142 enum Splitter
143 {
144 	files,     /// Load entire files only
145 	lines,     /// Split by line ends
146 	words,     /// Split by whitespace
147 	D,         /// Parse D source code
148 	diff,      /// Unified diffs
149 	indent,    /// Indentation (Python, YAML...)
150 }
151 immutable string[] splitterNames = [EnumMembers!Splitter].map!(e => e.text().toLower()).array();
152 
153 struct ParseRule
154 {
155 	string pattern;
156 	Splitter splitter;
157 }
158 
159 struct ParseOptions
160 {
161 	enum Mode { source, words }
162 
163 	bool stripComments;
164 	ParseRule[] rules;
165 	Mode mode;
166 	uint tabWidth;
167 }
168 
169 /// Parse the given file/directory.
170 /// For files, modifies path to be the base name for .test / .reduced directories.
171 Entity loadFiles(ref string path, ParseOptions options)
172 {
173 	if (isFile(path))
174 	{
175 		auto filePath = path;
176 		path = stripExtension(path);
177 		return loadFile(filePath.baseName(), filePath, options);
178 	}
179 	else
180 	{
181 		auto set = new Entity();
182 		foreach (string entry; dirEntries(path, SpanMode.breadth).array.sort!((a, b) => a.name < b.name))
183 			if (isFile(entry))
184 			{
185 				assert(entry.startsWith(path));
186 				auto name = entry[path.length+1..$];
187 				set.children ~= loadFile(name, entry, options);
188 			}
189 		return set;
190 	}
191 }
192 
193 enum BIN_SIZE = 2;
194 
195 void optimizeUntil(alias stop)(Entity set)
196 {
197 	static Entity group(Entity[] children)
198 	{
199 		if (children.length == 1)
200 			return children[0];
201 		auto e = new Entity(null, children, null);
202 		e.noRemove = children.any!(c => c.noRemove)();
203 		return e;
204 	}
205 
206 	static void clusterBy(ref Entity[] set, size_t binSize)
207 	{
208 		while (set.length > binSize)
209 		{
210 			auto size = set.length >= binSize*2 ? binSize : (set.length+1) / 2;
211 			//auto size = binSize;
212 
213 			set = set.chunks(size).map!group.array;
214 		}
215 	}
216 
217 	void doOptimize(Entity e)
218 	{
219 		if (stop(e))
220 			return;
221 		foreach (c; e.children)
222 			doOptimize(c);
223 		clusterBy(e.children, BIN_SIZE);
224 	}
225 
226 	doOptimize(set);
227 }
228 
229 alias optimize = optimizeUntil!((Entity e) => false);
230 
231 private:
232 
233 /// Override std.string nonsense, which does UTF-8 decoding
234 bool startsWith(in char[] big, in char[] small) { return big.length >= small.length && big[0..small.length] == small; }
235 bool startsWith(in char[] big, char c) { return big.length && big[0] == c; }
236 string strip(string s) { while (s.length && isWhite(s[0])) s = s[1..$]; while (s.length && isWhite(s[$-1])) s = s[0..$-1]; return s; }
237 
238 immutable ParseRule[] defaultRules =
239 [
240 	{ "*.d"    , Splitter.D     },
241 	{ "*.di"   , Splitter.D     },
242 	{ "*.diff" , Splitter.diff  },
243 	{ "*.patch", Splitter.diff  },
244 	{ "*"      , Splitter.files },
245 ];
246 
247 Entity loadFile(string name, string path, ParseOptions options)
248 {
249 	stderr.writeln("Loading ", path);
250 	auto result = new Entity();
251 	result.filename = name.replace(`\`, `/`);
252 	result.contents = cast(string)read(path);
253 
254 	auto base = name.baseName();
255 	foreach (rule; chain(options.rules, defaultRules))
256 		if (base.globMatch(rule.pattern))
257 		{
258 			final switch (rule.splitter)
259 			{
260 				case Splitter.files:
261 					result.children = [new Entity(result.contents, null, null)];
262 					return result;
263 				case Splitter.lines:
264 					result.children = parseToLines(result.contents);
265 					return result;
266 				case Splitter.words:
267 					result.children = parseToWords(result.contents);
268 					return result;
269 				case Splitter.D:
270 				{
271 					if (result.contents.startsWith("Ddoc"))
272 						goto case Splitter.files;
273 
274 					DSplitter splitter;
275 					if (options.stripComments)
276 						result.contents = splitter.stripComments(result.contents);
277 
278 					final switch (options.mode)
279 					{
280 						case ParseOptions.Mode.source:
281 							result.children = splitter.parse(result.contents);
282 							return result;
283 						case ParseOptions.Mode.words:
284 							result.children = splitter.parseToWords(result.contents);
285 							return result;
286 					}
287 				}
288 				case Splitter.diff:
289 					result.children = parseDiff(result.contents);
290 					return result;
291 				case Splitter.indent:
292 					result.children = parseIndent(result.contents, options.tabWidth);
293 					return result;
294 			}
295 		}
296 	assert(false); // default * rule should match everything
297 }
298 
299 // *****************************************************************************************************************************************************************************
300 
301 /// A simple, error-tolerant D source code splitter.
302 struct DSplitter
303 {
304 	struct Pair { string start, end; }
305 	static const Pair[] pairs =
306 	[
307 		{ "{", "}" },
308 		{ "[", "]" },
309 		{ "(", ")" },
310 	];
311 
312 	static immutable string[] blockKeywords = ["try", "catch", "finally", "while", "do", "in", "out", "body", "if", "static if", "else", "for", "foreach"];
313 	static immutable string[] parenKeywords = ["catch", "while", "if", "static if", "for", "foreach"];
314 
315 	/// The order dictates the splitting priority of the separators.
316 	static immutable string[][] separators =
317 	[
318 		[";", "{"] ~ blockKeywords,
319 		["import"],
320 		// From http://wiki.dlang.org/Operator_precedence
321 		// Some items are listed twice, DustMite does not distinguish binary/prefix/postfix operators
322 		[".."],
323 		[","],
324 		["=>"],
325 		["=", "-=", "+=", "<<=", ">>=", ">>>=", "=", "*=", "%=", "^=", "^^=", "~="],
326 		["?", ":"],
327 		["||"],
328 		["&&"],
329 		["|"],
330 		["^"],
331 		["&"],
332 		["==", "!=", ">", "<", ">=", "<=", "!>", "!<", "!>=", "!<=", "<>", "!<>", "<>=", "!<>=", "in", "!in", "is", "!is"],
333 		["<<", ">>", ">>>"],
334 		["+", "-", "~"],
335 		["*", "/", "%"],
336 		["&", "++", "--", "*", "+", "-", /*"!",*/ "~"],
337 		["^^"],
338 		[".", "++", "--" /*, "(", "["*/],
339 		// "=>",
340 		["!"],
341 		["(", "["]
342 	];
343 
344 	enum Token : int
345 	{
346 		none,
347 		end,        /// end of stream
348 		whitespace, /// spaces, tabs, newlines
349 		comment,    /// all forms of comments
350 		other,      /// identifiers, literals and misc. keywords
351 
352 		generated0, /// First value of generated tokens (see below)
353 
354 		max = tokenText.length
355 	}
356 
357 	static immutable string[] tokenText =
358 	{
359 		auto result = new string[Token.generated0];
360 		Token[string] lookup;
361 
362 		Token add(string s)
363 		{
364 			auto p = s in lookup;
365 			if (p)
366 				return *p;
367 
368 			Token t = cast(Token)result.length;
369 			result ~= s;
370 			lookup[s] = t;
371 			return t;
372 		}
373 
374 		foreach (pair; pairs)
375 		{
376 			add(pair.start);
377 			add(pair.end  );
378 		}
379 
380 		foreach (i, synonyms; separators)
381 			foreach (sep; synonyms)
382 				add(sep);
383 		return result;
384 	}();
385 
386 	static Token lookupToken(string s)
387 	{
388 		if (!__ctfe) assert(false, "Don't use at runtime");
389 		foreach (t; Token.generated0 .. Token.max)
390 			if (s == tokenText[t])
391 				return t;
392 		assert(false, "No such token: " ~ s);
393 	}
394 	enum Token tokenLookup(string s) = lookupToken(s);
395 
396 	struct TokenPair { Token start, end; }
397 	static TokenPair makeTokenPair(Pair pair) { return TokenPair(lookupToken(pair.start), lookupToken(pair.end)); }
398 	alias lookupTokens = arrayMap!(lookupToken, const string);
399 	static immutable TokenPair[] pairTokens      = pairs     .arrayMap!makeTokenPair();
400 	static immutable Token[][]   separatorTokens = separators.arrayMap!lookupTokens ();
401 	static immutable Token[] blockKeywordTokens = blockKeywords.arrayMap!lookupToken();
402 	static immutable Token[] parenKeywordTokens = parenKeywords.arrayMap!lookupToken();
403 
404 	enum SeparatorType
405 	{
406 		none,
407 		pair,
408 		prefix,
409 		postfix,
410 		binary,  /// infers dependency
411 	}
412 
413 	static SeparatorType getSeparatorType(Token t)
414 	{
415 		switch (t)
416 		{
417 			case tokenLookup!";":
418 				return SeparatorType.postfix;
419 			case tokenLookup!"import":
420 				return SeparatorType.prefix;
421 			case tokenLookup!"else":
422 				return SeparatorType.binary;
423 			default:
424 				if (pairTokens.any!(pair => pair.start == t))
425 					return SeparatorType.pair;
426 				if (blockKeywordTokens.canFind(t))
427 					return SeparatorType.prefix;
428 				if (separatorTokens.any!(row => row.canFind(t)))
429 					return SeparatorType.binary;
430 				return SeparatorType.none;
431 		}
432 	}
433 
434 	// ************************************************************************
435 
436 	string s; /// Source code we are splitting
437 	size_t i; /// Cursor
438 
439 	/// Making the end of an input stream a throwable greatly simplifies control flow.
440 	class EndOfInput : Throwable { this() { super(null); } }
441 
442 	/// Are we at the end of the stream?
443 	@property bool eos() { return i >= s.length; }
444 
445 	/// Advances i by n bytes. Throws EndOfInput if there are not enough.
446 	string advance(size_t n)
447 	{
448 		auto e = i + n;
449 		if (e > s.length)
450 		{
451 			i = s.length;
452 			throw new EndOfInput;
453 		}
454 		auto result = s[i..e];
455 		i = e;
456 		return result;
457 	}
458 
459 	/// ditto
460 	char advance() { return advance(1)[0]; }
461 
462 	/// If pre comes next, advance i through pre and return it.
463 	/// Otherwise, return null.
464 	string consume(string pre)
465 	{
466 		if (s[i..$].startsWith(pre))
467 			return advance(pre.length);
468 		else
469 			return null;
470 	}
471 
472 	/// ditto
473 	char consume(char pre)
474 	{
475 		assert(pre);
476 		if (s[i..$].startsWith(pre))
477 			return advance();
478 		else
479 			return 0;
480 	}
481 
482 	/// Peeks at the next n characters.
483 	string peek(size_t n)
484 	{
485 		if (i+n > s.length)
486 			throw new EndOfInput;
487 		return s[i..i+n];
488 	}
489 
490 	/// ditto
491 	char peek() { return peek(1)[0]; }
492 
493 	/// Advances i through one token (whether it's a comment,
494 	/// a series of whitespace characters, or something else).
495 	/// Returns the token type.
496 	Token skipTokenOrWS()
497 	{
498 		if (eos)
499 			return Token.end;
500 
501 		Token result = Token.other;
502 		try
503 		{
504 			auto c = advance();
505 			switch (c)
506 			{
507 			case '\'':
508 				result = Token.other;
509 				if (consume('\\'))
510 					advance();
511 				while (advance() != '\'')
512 					continue;
513 				break;
514 			case '\\':  // D1 naked escaped string
515 				result = Token.other;
516 				advance();
517 				break;
518 			case '"':
519 				result = Token.other;
520 				while (peek() != '"')
521 				{
522 					if (advance() == '\\')
523 						advance();
524 				}
525 				advance();
526 				break;
527 			case 'r':
528 				if (consume(`"`))
529 				{
530 					result = Token.other;
531 					while (advance() != '"')
532 						continue;
533 					break;
534 				}
535 				else
536 					goto default;
537 			case '`':
538 				result = Token.other;
539 				while (advance() != '`')
540 					continue;
541 				break;
542 			case '/':
543 				if (consume('/'))
544 				{
545 					result = Token.comment;
546 					while (peek() != '\r' && peek() != '\n')
547 						advance();
548 				}
549 				else
550 				if (consume('*'))
551 				{
552 					result = Token.comment;
553 					while (!consume("*/"))
554 						advance();
555 				}
556 				else
557 				if (consume('+'))
558 				{
559 					result = Token.comment;
560 					int commentLevel = 1;
561 					while (commentLevel)
562 					{
563 						if (consume("/+"))
564 							commentLevel++;
565 						else
566 						if (consume("+/"))
567 							commentLevel--;
568 						else
569 							advance();
570 					}
571 				}
572 				else
573 					goto default;
574 				break;
575 			case '@':
576 				if (consume("disable")
577 				 || consume("property")
578 				 || consume("safe")
579 				 || consume("trusted")
580 				 || consume("system")
581 				)
582 					return Token.other;
583 				goto default;
584 			case '#':
585 				result = Token.other;
586 				do
587 				{
588 					c = advance();
589 					if (c == '\\')
590 						c = advance();
591 				}
592 				while (c != '\n');
593 				break;
594 			default:
595 				{
596 					i--;
597 					Token best;
598 					size_t bestLength;
599 					foreach (Token t; Token.init..Token.max)
600 					{
601 						auto text = tokenText[t];
602 						if (!text)
603 							continue;
604 						if (!s[i..$].startsWith(text))
605 							continue;
606 						if (text[$-1].isAlphaNum() && s.length > i + text.length && s[i + text.length].isAlphaNum())
607 							continue; // if the token is a word, it must end at a word boundary
608 						if (text.length <= bestLength)
609 							continue;
610 						best = t;
611 						bestLength = text.length;
612 					}
613 					if (bestLength)
614 					{
615 						auto consumed = consume(tokenText[best]);
616 						assert(consumed);
617 						return best;
618 					}
619 
620 					i++;
621 				}
622 				if (c.isWhite())
623 				{
624 					result = Token.whitespace;
625 					while (peek().isWhite())
626 						advance();
627 				}
628 				else
629 				if (isWordChar(c))
630 				{
631 					result = Token.other;
632 					while (isWordChar(peek()))
633 						advance();
634 				}
635 				else
636 					result = Token.other;
637 			}
638 		}
639 		catch (EndOfInput)
640 			i = s.length;
641 		return result;
642 	}
643 
644 	/// Skips leading and trailing whitespace/comments, too.
645 	/// Never returns Token.whitespace or Token.comment.
646 	void readToken(out Token type, out string span)
647 	{
648 		size_t spanStart = i;
649 		do
650 			type = skipTokenOrWS();
651 		while (type == Token.whitespace || type == Token.comment);
652 		skipToEOL();
653 		span = s[spanStart..i];
654 		if (type == Token.end && span.length)
655 			type = Token.whitespace;
656 	}
657 
658 	/// Moves i forward over first series of EOL characters,
659 	/// or until first non-whitespace character, whichever comes first.
660 	void skipToEOL()
661 	{
662 		try
663 			while (true)
664 			{
665 				auto c = peek();
666 				if (c == '\r' || c == '\n')
667 				{
668 					do
669 						advance(), c = peek();
670 					while (c == '\r' || c == '\n');
671 					return;
672 				}
673 				else
674 				if (c.isWhite())
675 					i++;
676 				else
677 				if (peek(2) == "//")
678 				{
679 					auto t = skipTokenOrWS();
680 					assert(t == Token.comment);
681 				}
682 				else
683 					break;
684 			}
685 		catch (EndOfInput)
686 			i = s.length;
687 	}
688 
689 	static bool isWordChar(char c)
690 	{
691 		return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_';
692 	}
693 
694 	void reset(string code)
695 	{
696 		s = code;
697 		i = 0;
698 	}
699 
700 	void parseScope(Entity result, Token scopeEnd)
701 	{
702 		enum Level : int
703 		{
704 			zero,
705 			separator0,
706 			separatorMax = separator0 + separators.length - 1,
707 			text,
708 			max
709 		}
710 
711 		Entity[][Level.max] splitterQueue;
712 
713 		Entity[] terminateLevel(Level level)
714 		{
715 			level++;
716 			if (level == Level.max)
717 				return null;
718 
719 			auto r = splitterQueue[level] ~ group(terminateLevel(level));
720 			splitterQueue[level] = null;
721 			return r;
722 		}
723 
724 		while (true)
725 		{
726 			Token token;
727 			string span;
728 			readToken(token, span);
729 
730 			if (token == Token.end)
731 			{
732 				assert(span.empty);
733 				break;
734 			}
735 			if (token == scopeEnd)
736 			{
737 				result.tail = span;
738 				break;
739 			}
740 
741 			auto e = new Entity();
742 			e.token = token;
743 			auto level = Level.text;
744 
745 			Entity after;
746 
747 			foreach (n, synonyms; separatorTokens)
748 				foreach (sep; synonyms)
749 					if (token == sep)
750 					{
751 						level = cast(Level)(Level.separator0 + n);
752 						e.children = terminateLevel(level);
753 						auto type = getSeparatorType(token);
754 						if (type == SeparatorType.prefix || type == SeparatorType.pair)
755 						{
756 							Entity empty = e;
757 							if (e.children.length)
758 							{
759 								e.token = Token.none;
760 								after = empty = new Entity(span);
761 								after.token = token;
762 							}
763 							else
764 								e.head = span;
765 
766 							if (type == SeparatorType.pair)
767 								parseScope(empty, pairTokens.find!(pair => pair.start == token).front.end);
768 						}
769 						else
770 							e.tail = span;
771 						goto handled;
772 					}
773 
774 			e.head = span;
775 
776 		handled:
777 			splitterQueue[level] ~= e;
778 			if (after)
779 				splitterQueue[level] ~= after;
780 		}
781 
782 		result.children ~= terminateLevel(Level.zero);
783 	}
784 
785 	Entity[] parse(string code)
786 	{
787 		reset(code);
788 		auto entity = new Entity;
789 		parseScope(entity, Token.none);
790 		assert(!entity.head && !entity.tail);
791 		postProcess(entity.children);
792 		return [entity];
793 	}
794 
795 	Entity[] parseToWords(string code)
796 	{
797 		reset(code);
798 		Entity[] result;
799 		while (!eos)
800 		{
801 			auto start = i;
802 			auto token = skipTokenOrWS();
803 			auto span = s[start..i];
804 
805 			if (token == Token.other)
806 				result ~= new Entity(span);
807 			else
808 			{
809 				if (!result.length)
810 					result ~= new Entity();
811 				if (!result[$-1].tail)
812 					result[$-1].tail = span;
813 				else
814 				{
815 					start = result[$-1].tail.ptr - s.ptr;
816 					result[$-1].tail = s[start..i];
817 				}
818 			}
819 		}
820 		return result;
821 	}
822 
823 	string stripComments(string code)
824 	{
825 		reset(code);
826 		auto result = appender!string();
827 		while (!eos)
828 		{
829 			auto start = i;
830 			Token t = skipTokenOrWS();
831 			if (t != Token.comment)
832 				result.put(s[start..i]);
833 		}
834 		return result.data;
835 	}
836 
837 	static Entity[] group(Entity[] entities)
838 	{
839 		if (entities.length <= 1)
840 			return entities;
841 		return [new Entity(null, entities, null)];
842 	}
843 
844 	static void postProcessSimplify(ref Entity[] entities)
845 	{
846 		for (size_t i=0; i<entities.length;)
847 		{
848 			auto e = entities[i];
849 			if (e.head.empty && e.tail.empty && e.dependents.empty)
850 			{
851 				assert(e.token == Token.none);
852 				if (e.children.length == 0)
853 				{
854 					entities = entities.remove(i);
855 					continue;
856 				}
857 				else
858 				if (e.children.length == 1)
859 				{
860 					entities[i] = e.children[0];
861 					continue;
862 				}
863 			}
864 
865 			i++;
866 		}
867 	}
868 
869 	static void postProcessDependency(ref Entity[] entities)
870 	{
871 		if (entities.length < 2)
872 		{
873 			foreach (e; entities)
874 				postProcessDependency(e.children);
875 			return;
876 		}
877 
878 		size_t[] points;
879 		foreach_reverse (i, e; entities[0..$-1])
880 			if (getSeparatorType(e.token) == SeparatorType.binary && e.children)
881 				points ~= i;
882 
883 		if (points.length)
884 		{
885 			auto i = points[$/2];
886 			auto e = entities[i];
887 
888 			auto head = entities[0..i] ~ group(e.children);
889 			e.children = null;
890 			auto tail = new Entity(null, group(entities[i+1..$]), null);
891 			tail.dependents ~= EntityRef(e);
892 			entities = group(head ~ e) ~ tail;
893 			foreach (c; entities)
894 				postProcessDependency(c.children);
895 		}
896 	}
897 
898 	static void postProcessTemplates(ref Entity[] entities)
899 	{
900 		if (!entities.length)
901 			return;
902 		foreach_reverse (i, e; entities[0..$-1])
903 			if (e.token == tokenLookup!"!" && entities[i+1].children.length && entities[i+1].children[0].token == tokenLookup!"(")
904 			{
905 				auto dependency = new Entity;
906 				dependency.dependents = [EntityRef(e), EntityRef(entities[i+1].children[0])];
907 				entities = entities[0..i+1] ~ dependency ~ entities[i+1..$];
908 			}
909 	}
910 
911 	static void postProcessDependencyBlock(ref Entity[] entities)
912 	{
913 		foreach (i, e; entities)
914 			if (i && !e.token && e.children.length && getSeparatorType(e.children[0].token) == SeparatorType.binary && !e.children[0].children)
915 				entities[i-1].dependents ~= EntityRef(e.children[0]);
916 	}
917 
918 	static void postProcessBlockKeywords(ref Entity[] entities)
919 	{
920 		foreach_reverse (i; 0 .. entities.length)
921 		{
922 			if (blockKeywordTokens.canFind(entities[i].token) && i+1 < entities.length)
923 			{
924 				auto j = i + 1;
925 				if (j < entities.length && entities[j].token == tokenLookup!"(")
926 					j++;
927 				j++; // ; or {
928 				if (j <= entities.length)
929 					entities = entities[0..i] ~ group(group(entities[i..j-1]) ~ entities[j-1..j]) ~ entities[j..$];
930 			}
931 		}
932 	}
933 
934 	static void postProcessBlockStatements(ref Entity[] entities)
935 	{
936 		for (size_t i=0; i<entities.length;)
937 		{
938 			auto j = i;
939 			bool consume(Token t)
940 			{
941 				if (j < entities.length
942 				 && entities[j].children.length == 2
943 				 && firstToken(entities[j].children[0]) == t)
944 				{
945 					j++;
946 					return true;
947 				}
948 				return false;
949 			}
950 
951 			if (consume(tokenLookup!"if") || consume(tokenLookup!"static if"))
952 				consume(tokenLookup!"else");
953 			else
954 			if (consume(tokenLookup!"do"))
955 				consume(tokenLookup!"while");
956 			else
957 			if (consume(tokenLookup!"try"))
958 			{
959 				while (consume(tokenLookup!"catch"))
960 					continue;
961 				consume(tokenLookup!"finally");
962 			}
963 
964 			if (i == j)
965 			{
966 				j++;
967 				while (consume(tokenLookup!"in") || consume(tokenLookup!"out") || consume(tokenLookup!"body"))
968 					continue;
969 			}
970 
971 			if (j-i > 1)
972 			{
973 				entities = entities[0..i] ~ group(entities[i..j]) ~ entities[j..$];
974 				continue;
975 			}
976 
977 			i++;
978 		}
979 	}
980 
981 	static void postProcessPairs(ref Entity[] entities)
982 	{
983 		size_t lastPair = 0;
984 
985 		for (size_t i=0; i<entities.length;)
986 		{
987 			// Create pair entities
988 
989 			if (entities[i].token == tokenLookup!"{")
990 			{
991 				if (i >= lastPair + 1)
992 				{
993 					lastPair = i-1;
994 					entities = entities[0..lastPair] ~ group(group(entities[lastPair..i]) ~ entities[i]) ~ entities[i+1..$];
995 					i = lastPair;
996 					entities[i].isPair = true;
997 					lastPair++;
998 					continue;
999 				}
1000 				else
1001 					lastPair = i + 1;
1002 			}
1003 			else
1004 			if (entities[i].token == tokenLookup!";")
1005 				lastPair = i + 1;
1006 
1007 			i++;
1008 		}
1009 	}
1010 
1011 	static void postProcessParens(ref Entity[] entities)
1012 	{
1013 		for (size_t i=0; i+1 < entities.length;)
1014 		{
1015 			if (parenKeywordTokens.canFind(entities[i].token))
1016 			{
1017 				auto pparen = firstHead(entities[i+1]);
1018 				if (pparen
1019 				 && *pparen !is entities[i+1]
1020 				 && pparen.token == tokenLookup!"(")
1021 				{
1022 					auto paren = *pparen;
1023 					*pparen = new Entity();
1024 					entities = entities[0..i] ~ group([entities[i], paren]) ~ entities[i+1..$];
1025 					continue;
1026 				}
1027 			}
1028 
1029 			i++;
1030 		}
1031 
1032 		foreach (e; entities)
1033 			postProcessParens(e.children);
1034 	}
1035 
1036 	static bool isValidIdentifier(string s)
1037 	{
1038 		if (!s.length)
1039 			return false;
1040 		if (!isAlpha(s[0]))
1041 			return false;
1042 		foreach (c; s[1..$])
1043 			if (!isAlphaNum(c))
1044 				return false;
1045 		return true;
1046 	}
1047 
1048 	/// Get all nodes between (exclusively) two addresses.
1049 	/// If either address is empty, then the respective bound is the respective extreme.
1050 	static Entity[] nodesBetween(Entity root, size_t[] a, size_t[] b)
1051 	{
1052 		while (a.length && b.length && a[0] == b[0])
1053 		{
1054 			root = root.children[a[0]];
1055 			a = a[1..$];
1056 			b = b[1..$];
1057 		}
1058 		size_t index0, index1;
1059 		Entity[] children0, children1;
1060 		if (a.length)
1061 		{
1062 			index0 = a[0] + 1;
1063 			if (a.length > 1)
1064 				children0 = nodesBetween(root.children[a[0]], a[1..$], null);
1065 		}
1066 		else
1067 			index0 = 0;
1068 
1069 		if (b.length)
1070 		{
1071 			index1 = b[0];
1072 			if (b.length > 1)
1073 				children1 = nodesBetween(root.children[b[0]], null, b[1..$]);
1074 		}
1075 		else
1076 			index1 = root.children.length;
1077 
1078 		assert(index0 <= index1);
1079 		return children0 ~ root.children[index0 .. index1] ~ children1;
1080 	}
1081 
1082 	static void postProcessRecursive(ref Entity[] entities)
1083 	{
1084 		foreach (e; entities)
1085 			if (e.children.length)
1086 				postProcessRecursive(e.children);
1087 
1088 		postProcessSimplify(entities);
1089 		postProcessTemplates(entities);
1090 		postProcessDependency(entities);
1091 		postProcessBlockKeywords(entities);
1092 		postProcessDependencyBlock(entities);
1093 		postProcessBlockStatements(entities);
1094 		postProcessPairs(entities);
1095 		postProcessParens(entities);
1096 	}
1097 
1098 	/// Attempt to link together function arguments / parameters for
1099 	/// things that look like calls to the same function, to allow removing
1100 	/// unused function arguments / parameters.
1101 	static void postProcessArgs(ref Entity[] entities)
1102 	{
1103 		string lastID;
1104 
1105 		Entity[][][string] calls;
1106 
1107 		void visit(Entity entity)
1108 		{
1109 			auto id = entity.head.strip();
1110 			if (entity.token == Token.other && isValidIdentifier(id) && !entity.tail && !entity.children)
1111 				lastID = id;
1112 			else
1113 			if (lastID && entity.token == tokenLookup!"(")
1114 			{
1115 				size_t[] stack;
1116 				struct Comma { size_t[] addr, after; }
1117 				Comma[] commas;
1118 
1119 				bool afterComma;
1120 
1121 				// Find all top-level commas
1122 				void visit2(size_t i, Entity entity)
1123 				{
1124 					stack ~= i;
1125 					if (afterComma)
1126 					{
1127 						commas[$-1].after = stack;
1128 						//entity.comments ~= "After-comma %d".format(commas.length);
1129 						afterComma = false;
1130 					}
1131 
1132 					if (entity.token == tokenLookup!",")
1133 					{
1134 						commas ~= Comma(stack);
1135 						//entity.comments ~= "Comma %d".format(commas.length);
1136 						afterComma = true;
1137 					}
1138 					else
1139 					if (entity.head.length || entity.tail.length)
1140 						{}
1141 					else
1142 						foreach (j, child; entity.children)
1143 							visit2(j, child);
1144 					stack = stack[0..$-1];
1145 				}
1146 
1147 				foreach (i, child; entity.children)
1148 					visit2(i, child);
1149 
1150 				// Find all nodes between commas, effectively obtaining the arguments
1151 				size_t[] last = null;
1152 				commas ~= [Comma()];
1153 				Entity[][] args;
1154 				foreach (i, comma; commas)
1155 				{
1156 					//Entity entityAt(Entity root, size_t[] address) { return address.length ? entityAt(root.children[address[0]], address[1..$]) : root; }
1157 					//entityAt(entity, last).comments ~= "nodesBetween-left %d".format(i);
1158 					//entityAt(entity, comma.after).comments ~= "nodesBetween-right %d".format(i);
1159 					args ~= nodesBetween(entity, last, comma.after);
1160 					last = comma.addr;
1161 				}
1162 
1163 				// Register the arguments
1164 				foreach (i, arg; args)
1165 				{
1166 					debug
1167 						foreach (j, e; arg)
1168 							e.comments ~= "%s arg %d node %d".format(lastID, i, j);
1169 
1170 					if (arg.length == 1)
1171 					{
1172 						if (lastID !in calls)
1173 							calls[lastID] = null;
1174 						while (calls[lastID].length < i+1)
1175 							calls[lastID] ~= null;
1176 						calls[lastID][i] ~= arg[0];
1177 					}
1178 				}
1179 
1180 				lastID = null;
1181 				return;
1182 			}
1183 			else
1184 			if (entity.token == tokenLookup!"!")
1185 				{}
1186 			else
1187 			if (entity.head || entity.tail)
1188 				lastID = null;
1189 
1190 			foreach (child; entity.children)
1191 				visit(child);
1192 		}
1193 
1194 		foreach (entity; entities)
1195 			visit(entity);
1196 
1197 		// For each parameter, create a dummy empty node which is a dependency for all of the arguments.
1198 		auto callRoot = new Entity();
1199 		debug callRoot.comments ~= "Args root";
1200 		entities ~= callRoot;
1201 
1202 		foreach (id; calls.keys.sort())
1203 		{
1204 			auto funRoot = new Entity();
1205 			debug funRoot.comments ~= "%s root".format(id);
1206 			callRoot.children ~= funRoot;
1207 
1208 			foreach (i, args; calls[id])
1209 			{
1210 				auto e = new Entity();
1211 				debug e.comments ~= "%s param %d".format(id, i);
1212 				funRoot.children ~= e;
1213 				foreach (arg; args)
1214 					e.dependents ~= EntityRef(arg);
1215 			}
1216 		}
1217 	}
1218 
1219 	static void postProcess(ref Entity[] entities)
1220 	{
1221 		postProcessRecursive(entities);
1222 		postProcessArgs(entities);
1223 	}
1224 
1225 	static Entity* firstHead(ref return Entity e)
1226 	{
1227 		if (e.head.length)
1228 			return &e;
1229 		foreach (ref c; e.children)
1230 		{
1231 			auto r = firstHead(c);
1232 			if (r)
1233 				return r;
1234 		}
1235 		return null;
1236 	}
1237 
1238 	static Token firstToken(Entity e)
1239 	{
1240 		while (!e.token && e.children.length)
1241 			e = e.children[0];
1242 		return e.token;
1243 	}
1244 }
1245 
1246 public:
1247 
1248 /// Split the text into sequences for each fun is always true, and then always false
1249 Entity[] parseSplit(alias fun)(string text)
1250 {
1251 	Entity[] result;
1252 	size_t i, wordStart, wordEnd;
1253 	for (i = 1; i <= text.length; i++)
1254 		if (i==text.length || (fun(text[i-1]) && !fun(text[i])))
1255 		{
1256 			if (wordStart != i)
1257 				result ~= new Entity(text[wordStart..wordEnd], null, text[wordEnd..i]);
1258 			wordStart = wordEnd = i;
1259 		}
1260 		else
1261 		if ((!fun(text[i-1]) && fun(text[i])))
1262 			wordEnd = i;
1263 	return result;
1264 }
1265 
1266 alias parseToWords = parseSplit!isNotAlphaNum;
1267 alias parseToLines = parseSplit!isNewline;
1268 
1269 /// Split s on end~start, preserving end and start on each chunk
1270 private string[] split2(string end, string start)(string s)
1271 {
1272 	enum sep = end ~ start;
1273 	return split2Impl(s, sep, end.length);
1274 }
1275 
1276 private string[] split2Impl(string s, string sep, size_t endLength)
1277 {
1278 	string[] result;
1279 	while (true)
1280 	{
1281 		auto i = s.indexOf(sep);
1282 		if (i < 0)
1283 			return result ~ s;
1284 		i += endLength;
1285 		result ~= s[0..i];
1286 		s = s[i..$];
1287 	}
1288 }
1289 
1290 unittest
1291 {
1292 	assert(split2!("]", "[")(null) == [""]);
1293 	assert(split2!("]", "[")("[foo]") == ["[foo]"]);
1294 	assert(split2!("]", "[")("[foo][bar]") == ["[foo]", "[bar]"]);
1295 	assert(split2!("]", "[")("[foo] [bar]") == ["[foo] [bar]"]);
1296 }
1297 
1298 Entity[] parseDiff(string s)
1299 {
1300 	return s
1301 		.split2!("\n", "diff ")
1302 		.map!(
1303 			(string file)
1304 			{
1305 				auto chunks = file.split2!("\n", "@@ ");
1306 				return new Entity(chunks[0], chunks[1..$].map!(chunk => new Entity(chunk)).array);
1307 			}
1308 		)
1309 		.array
1310 	;
1311 }
1312 
1313 Entity[] parseIndent(string s, uint tabWidth)
1314 {
1315 	Entity[] root;
1316 	Entity[]*[] stack;
1317 
1318 	foreach (line; s.split2!("\n", ""))
1319 	{
1320 		size_t indent = 0;
1321 	charLoop:
1322 		foreach (c; line)
1323 			switch (c)
1324 			{
1325 				case ' ':
1326 					indent++;
1327 					break;
1328 				case '\t':
1329 					indent += tabWidth;
1330 					break;
1331 				case '\r':
1332 				case '\n':
1333 					// Treat empty (whitespace-only) lines as belonging to the
1334 					// immediately higher (most-nested) block.
1335 					indent = stack.length;
1336 					break charLoop;
1337 				default:
1338 					break charLoop;
1339 			}
1340 
1341 		auto e = new Entity(line);
1342 		foreach_reverse (i; 0 .. min(indent, stack.length)) // non-inclusively up to indent
1343 			if (stack[i])
1344 			{
1345 				*stack[i] ~= e;
1346 				goto parentFound;
1347 			}
1348 		root ~= e;
1349 	parentFound:
1350 		stack.length = indent + 1;
1351 		stack[indent] = &e.children;
1352 	}
1353 
1354 	return root;
1355 }
1356 
1357 private:
1358 
1359 bool isNewline(char c) { return c == '\r' || c == '\n'; }
1360 alias isNotAlphaNum = not!isAlphaNum;
1361 
1362 // https://d.puremagic.com/issues/show_bug.cgi?id=11824
1363 auto arrayMap(alias fun, T)(T[] arr)
1364 {
1365 	alias R = typeof(fun(arr[0]));
1366 	auto result = new R[arr.length];
1367 	foreach (i, v; arr)
1368 		result[i] = fun(v);
1369 	return result;
1370 }