splitter source code

1 /// Simple source code splitter
2 /// Written by Vladimir Panteleev <vladimir@thecybershadow.net>
3 /// Released into the Public Domain
4 
5 module splitter;
6 
7 import std.ascii;
8 import std.algorithm;
9 import std.array;
10 import std.conv;
11 import std.exception;
12 import std.file;
13 import std.functional;
14 import std.path;
15 import std.range;
16 import std..string;
17 import std.traits;
18 import std.stdio : stderr;
19 
20 /// Represents a slice of the original code.
21 class Entity
22 {
23 	string head;           /// This node's "head", e.g. "{" for a statement block.
24 	Entity[] children;     /// This node's children nodes, e.g. the statements of the statement block.
25 	string tail;           /// This node's "tail", e.g. "}" for a statement block.
26 
27 	string filename, contents;
28 	@property bool isFile() { return filename != ""; }
29 
30 	bool isPair;           /// Internal hint for --dump output
31 	bool noRemove;         /// Don't try removing this entity (children OK)
32 
33 	bool removed;          /// For dangling dependencies
34 	Entity[] dependencies; /// If any of these entities are omitted, so should this entity.
35 
36 	int id;                /// For diagnostics
37 	size_t descendants;    /// For progress display
38 
39 	DSplitter.Token token; /// Used internally
40 
41 	this(string head = null, Entity[] children = null, string tail = null)
42 	{
43 		this.head     = head;
44 		this.children = children;
45 		this.tail     = tail;
46 	}
47 
48 	string[] comments;
49 
50 	@property string comment()
51 	{
52 		string[] result = comments;
53 		if (isPair)
54 		{
55 			assert(token == DSplitter.Token.none);
56 			result ~= "Pair";
57 		}
58 		if (token && DSplitter.tokenText[token])
59 			result ~= DSplitter.tokenText[token];
60 		return result.length ? result.join(" / ") : null;
61 	}
62 
63 	override string toString()
64 	{
65 		return "%(%s%) %s %(%s%)".format([head], children, [tail]);
66 	}
67 }
68 
69 enum Mode
70 {
71 	source,
72 	words,     /// split identifiers, for obfuscation
73 }
74 
75 enum Splitter
76 {
77 	files,     /// Load entire files only
78 	lines,     /// Split by line ends
79 	words,     /// Split by whitespace
80 	D,         /// Parse D source code
81 	diff,      /// Unified diffs
82 }
83 immutable string[] splitterNames = [EnumMembers!Splitter].map!(e => e.text().toLower()).array();
84 
85 struct ParseRule
86 {
87 	string pattern;
88 	Splitter splitter;
89 }
90 
91 struct ParseOptions
92 {
93 	enum Mode { source, words }
94 
95 	bool stripComments;
96 	ParseRule[] rules;
97 	Mode mode;
98 }
99 
100 /// Parse the given file/directory.
101 /// For files, modifies path to be the base name for .test / .reduced directories.
102 Entity loadFiles(ref string path, ParseOptions options)
103 {
104 	if (isFile(path))
105 	{
106 		auto filePath = path;
107 		path = stripExtension(path);
108 		return loadFile(filePath.baseName(), filePath, options);
109 	}
110 	else
111 	{
112 		auto set = new Entity();
113 		foreach (string entry; dirEntries(path, SpanMode.breadth).array.sort!((a, b) => a.name < b.name))
114 			if (isFile(entry))
115 			{
116 				assert(entry.startsWith(path));
117 				auto name = entry[path.length+1..$];
118 				set.children ~= loadFile(name, entry, options);
119 			}
120 		return set;
121 	}
122 }
123 
124 enum BIN_SIZE = 2;
125 
126 void optimize(Entity set)
127 {
128 	static void group(ref Entity[] set, size_t start, size_t end)
129 	{
130 		//set = set[0..start] ~ [new Entity(removable, set[start..end])] ~ set[end..$];
131 		auto children = set[start..end].dup;
132 		auto e = new Entity(null, children, null);
133 		e.noRemove = children.any!(c => c.noRemove)();
134 		set.replaceInPlace(start, end, [e]);
135 	}
136 
137 	static void clusterBy(ref Entity[] set, size_t binSize)
138 	{
139 		while (set.length > binSize)
140 		{
141 			auto size = set.length >= binSize*2 ? binSize : (set.length+1) / 2;
142 			//auto size = binSize;
143 
144 			auto bins = set.length/size;
145 			if (set.length % size > 1)
146 				group(set, bins*size, set.length);
147 			foreach_reverse (i; 0..bins)
148 				group(set, i*size, (i+1)*size);
149 		}
150 	}
151 
152 	static void doOptimize(Entity e)
153 	{
154 		foreach (c; e.children)
155 			doOptimize(c);
156 		clusterBy(e.children, BIN_SIZE);
157 	}
158 
159 	doOptimize(set);
160 }
161 
162 private:
163 
164 /// Override std.string nonsense, which does UTF-8 decoding
165 bool startsWith(in char[] big, in char[] small) { return big.length >= small.length && big[0..small.length] == small; }
166 bool startsWith(in char[] big, char c) { return big.length && big[0] == c; }
167 string strip(string s) { while (s.length && isWhite(s[0])) s = s[1..$]; while (s.length && isWhite(s[$-1])) s = s[0..$-1]; return s; }
168 
169 immutable ParseRule[] defaultRules =
170 [
171 	{ "*.d"    , Splitter.D     },
172 	{ "*.di"   , Splitter.D     },
173 	{ "*.diff" , Splitter.diff  },
174 	{ "*.patch", Splitter.diff  },
175 	{ "*"      , Splitter.files },
176 ];
177 
178 Entity loadFile(string name, string path, ParseOptions options)
179 {
180 	stderr.writeln("Loading ", path);
181 	auto result = new Entity();
182 	result.filename = name.replace(`\`, `/`);
183 	result.contents = cast(string)read(path);
184 
185 	auto base = name.baseName();
186 	foreach (rule; chain(options.rules, defaultRules))
187 		if (base.globMatch(rule.pattern))
188 		{
189 			final switch (rule.splitter)
190 			{
191 				case Splitter.files:
192 					result.children = [new Entity(result.contents, null, null)];
193 					return result;
194 				case Splitter.lines:
195 					result.children = parseToLines(result.contents);
196 					return result;
197 				case Splitter.words:
198 					result.children = parseToWords(result.contents);
199 					return result;
200 				case Splitter.D:
201 				{
202 					if (result.contents.startsWith("Ddoc"))
203 						goto case Splitter.files;
204 
205 					DSplitter splitter;
206 					if (options.stripComments)
207 						result.contents = splitter.stripComments(result.contents);
208 
209 					final switch (options.mode)
210 					{
211 						case ParseOptions.Mode.source:
212 							result.children = splitter.parse(result.contents);
213 							return result;
214 						case ParseOptions.Mode.words:
215 							result.children = splitter.parseToWords(result.contents);
216 							return result;
217 					}
218 				}
219 				case Splitter.diff:
220 					result.children = parseDiff(result.contents);
221 					return result;
222 			}
223 		}
224 	assert(false); // default * rule should match everything
225 }
226 
227 // *****************************************************************************************************************************************************************************
228 
229 /// A simple, error-tolerant D source code splitter.
230 struct DSplitter
231 {
232 	struct Pair { string start, end; }
233 	static const Pair[] pairs =
234 	[
235 		{ "{", "}" },
236 		{ "[", "]" },
237 		{ "(", ")" },
238 	];
239 
240 	static immutable string[] blockKeywords = ["try", "catch", "finally", "while", "do", "in", "out", "body", "if", "static if", "else", "for", "foreach"];
241 	static immutable string[] parenKeywords = ["catch", "while", "if", "static if", "for", "foreach"];
242 
243 	/// The order dictates the splitting priority of the separators.
244 	static immutable string[][] separators =
245 	[
246 		[";", "{"] ~ blockKeywords,
247 		["import"],
248 		// From http://wiki.dlang.org/Operator_precedence
249 		// Some items are listed twice, DustMite does not distinguish binary/prefix/postfix operators
250 		[".."],
251 		[","],
252 		["=>"],
253 		["=", "-=", "+=", "<<=", ">>=", ">>>=", "=", "*=", "%=", "^=", "^^=", "~="],
254 		["?", ":"],
255 		["||"],
256 		["&&"],
257 		["|"],
258 		["^"],
259 		["&"],
260 		["==", "!=", ">", "<", ">=", "<=", "!>", "!<", "!>=", "!<=", "<>", "!<>", "<>=", "!<>=", "in", "!in", "is", "!is"],
261 		["<<", ">>", ">>>"],
262 		["+", "-", "~"],
263 		["*", "/", "%"],
264 		["&", "++", "--", "*", "+", "-", /*"!",*/ "~"],
265 		["^^"],
266 		[".", "++", "--" /*, "(", "["*/],
267 		// "=>",
268 		["!"],
269 		["(", "["]
270 	];
271 
272 	enum Token : int
273 	{
274 		none,
275 		end,        /// end of stream
276 		whitespace, /// spaces, tabs, newlines
277 		comment,    /// all forms of comments
278 		other,      /// identifiers, literals and misc. keywords
279 
280 		generated0, /// First value of generated tokens (see below)
281 
282 		max = generated0 + tokenLookup.length
283 	}
284 
285 	enum Token[string] tokenLookup = // DMD pr/2824
286 	{
287 		Token[string] lookup;
288 
289 		auto t = Token.generated0;
290 		Token add(string s)
291 		{
292 			auto p = s in lookup;
293 			if (p)
294 				return *p;
295 			return lookup[s] = t++;
296 		}
297 
298 		foreach (pair; pairs)
299 		{
300 			add(pair.start);
301 			add(pair.end  );
302 		}
303 
304 		foreach (i, synonyms; separators)
305 			foreach (sep; synonyms)
306 				add(sep);
307 
308 		return lookup;
309 	}();
310 
311 	static immutable string[Token.max] tokenText =
312 	{
313 		string[Token.max] result;
314 		foreach (k, v; tokenLookup)
315 			result[v] = k;
316 		return result;
317 	}();
318 
319 	struct TokenPair { Token start, end; }
320 	static Token lookupToken(string s) { return tokenLookup[s]; }
321 	static TokenPair makeTokenPair(Pair pair) { return TokenPair(tokenLookup[pair.start], tokenLookup[pair.end]); }
322 	alias lookupTokens = arrayMap!(lookupToken, const string);
323 	static immutable TokenPair[] pairTokens      = pairs     .arrayMap!makeTokenPair();
324 	static immutable Token[][]   separatorTokens = separators.arrayMap!lookupTokens ();
325 	static immutable Token[] blockKeywordTokens = blockKeywords.arrayMap!lookupToken();
326 	static immutable Token[] parenKeywordTokens = parenKeywords.arrayMap!lookupToken();
327 
328 	enum SeparatorType
329 	{
330 		none,
331 		pair,
332 		prefix,
333 		postfix,
334 		binary,  /// infers dependency
335 	}
336 
337 	static SeparatorType getSeparatorType(Token t)
338 	{
339 		switch (t)
340 		{
341 			case tokenLookup[";"]:
342 				return SeparatorType.postfix;
343 			case tokenLookup["import"]:
344 				return SeparatorType.prefix;
345 			case tokenLookup["else"]:
346 				return SeparatorType.binary;
347 			default:
348 				if (pairTokens.any!(pair => pair.start == t))
349 					return SeparatorType.pair;
350 				if (blockKeywordTokens.canFind(t))
351 					return SeparatorType.prefix;
352 				if (separatorTokens.any!(row => row.canFind(t)))
353 					return SeparatorType.binary;
354 				return SeparatorType.none;
355 		}
356 	}
357 
358 	// ************************************************************************
359 
360 	string s; /// Source code we are splitting
361 	size_t i; /// Cursor
362 
363 	/// Making the end of an input stream a throwable greatly simplifies control flow.
364 	class EndOfInput : Throwable { this() { super(null); } }
365 
366 	/// Are we at the end of the stream?
367 	@property bool eos() { return i >= s.length; }
368 
369 	/// Advances i by n bytes. Throws EndOfInput if there are not enough.
370 	string advance(size_t n)
371 	{
372 		auto e = i + n;
373 		if (e > s.length)
374 		{
375 			i = s.length;
376 			throw new EndOfInput;
377 		}
378 		auto result = s[i..e];
379 		i = e;
380 		return result;
381 	}
382 
383 	/// ditto
384 	char advance() { return advance(1)[0]; }
385 
386 	/// If pre comes next, advance i through pre and return it.
387 	/// Otherwise, return null.
388 	string consume(string pre)
389 	{
390 		if (s[i..$].startsWith(pre))
391 			return advance(pre.length);
392 		else
393 			return null;
394 	}
395 
396 	/// ditto
397 	char consume(char pre)
398 	{
399 		assert(pre);
400 		if (s[i..$].startsWith(pre))
401 			return advance();
402 		else
403 			return 0;
404 	}
405 
406 	/// Peeks at the next n characters.
407 	string peek(size_t n)
408 	{
409 		if (i+n > s.length)
410 			throw new EndOfInput;
411 		return s[i..i+n];
412 	}
413 
414 	/// ditto
415 	char peek() { return peek(1)[0]; }
416 
417 	/// Advances i through one token (whether it's a comment,
418 	/// a series of whitespace characters, or something else).
419 	/// Returns the token type.
420 	Token skipTokenOrWS()
421 	{
422 		if (eos)
423 			return Token.end;
424 
425 		Token result = Token.other;
426 		try
427 		{
428 			auto c = advance();
429 			switch (c)
430 			{
431 			case '\'':
432 				result = Token.other;
433 				if (consume('\\'))
434 					advance();
435 				while (advance() != '\'')
436 					continue;
437 				break;
438 			case '\\':  // D1 naked escaped string
439 				result = Token.other;
440 				advance();
441 				break;
442 			case '"':
443 				result = Token.other;
444 				while (peek() != '"')
445 				{
446 					if (advance() == '\\')
447 						advance();
448 				}
449 				advance();
450 				break;
451 			case 'r':
452 				if (consume(`r"`))
453 				{
454 					result = Token.other;
455 					while (advance() != '"')
456 						continue;
457 					break;
458 				}
459 				else
460 					goto default;
461 			case '`':
462 				result = Token.other;
463 				while (advance() != '`')
464 					continue;
465 				break;
466 			case '/':
467 				if (consume('/'))
468 				{
469 					result = Token.comment;
470 					while (peek() != '\r' && peek() != '\n')
471 						advance();
472 				}
473 				else
474 				if (consume('*'))
475 				{
476 					result = Token.comment;
477 					while (!consume("*/"))
478 						advance();
479 				}
480 				else
481 				if (consume('+'))
482 				{
483 					result = Token.comment;
484 					int commentLevel = 1;
485 					while (commentLevel)
486 					{
487 						if (consume("/+"))
488 							commentLevel++;
489 						else
490 						if (consume("+/"))
491 							commentLevel--;
492 						else
493 							advance();
494 					}
495 				}
496 				else
497 					goto default;
498 				break;
499 			case '@':
500 				if (consume("disable")
501 				 || consume("property")
502 				 || consume("safe")
503 				 || consume("trusted")
504 				 || consume("system")
505 				)
506 					return Token.other;
507 				goto default;
508 			case '#':
509 				result = Token.other;
510 				do
511 				{
512 					c = advance();
513 					if (c == '\\')
514 						c = advance();
515 				}
516 				while (c != '\n');
517 				break;
518 			default:
519 				{
520 					i--;
521 					Token best;
522 					size_t bestLength;
523 					foreach (Token t; Token.init..Token.max)
524 					{
525 						auto text = tokenText[t];
526 						if (!text)
527 							continue;
528 						if (!s[i..$].startsWith(text))
529 							continue;
530 						if (text[$-1].isAlphaNum() && s.length > i + text.length && s[i + text.length].isAlphaNum())
531 							continue; // if the token is a word, it must end at a word boundary
532 						if (text.length <= bestLength)
533 							continue;
534 						best = t;
535 						bestLength = text.length;
536 					}
537 					if (bestLength)
538 					{
539 						auto consumed = consume(tokenText[best]);
540 						assert(consumed);
541 						return best;
542 					}
543 
544 					i++;
545 				}
546 				if (c.isWhite())
547 				{
548 					result = Token.whitespace;
549 					while (peek().isWhite())
550 						advance();
551 				}
552 				else
553 				if (isWordChar(c))
554 				{
555 					result = Token.other;
556 					while (isWordChar(peek()))
557 						advance();
558 				}
559 				else
560 					result = Token.other;
561 			}
562 		}
563 		catch (EndOfInput)
564 			i = s.length;
565 		return result;
566 	}
567 
568 	/// Skips leading and trailing whitespace/comments, too.
569 	/// Never returns Token.whitespace or Token.comment.
570 	void readToken(out Token type, out string span)
571 	{
572 		size_t spanStart = i;
573 		do
574 			type = skipTokenOrWS();
575 		while (type == Token.whitespace || type == Token.comment);
576 		skipToEOL();
577 		span = s[spanStart..i];
578 		if (type == Token.end && span.length)
579 			type = Token.whitespace;
580 	}
581 
582 	/// Moves i forward over first series of EOL characters,
583 	/// or until first non-whitespace character, whichever comes first.
584 	void skipToEOL()
585 	{
586 		try
587 			while (true)
588 			{
589 				auto c = peek();
590 				if (c == '\r' || c == '\n')
591 				{
592 					do
593 						advance(), c = peek();
594 					while (c == '\r' || c == '\n');
595 					return;
596 				}
597 				else
598 				if (c.isWhite())
599 					i++;
600 				else
601 				if (peek(2) == "//")
602 				{
603 					auto t = skipTokenOrWS();
604 					assert(t == Token.comment);
605 				}
606 				else
607 					break;
608 			}
609 		catch (EndOfInput)
610 			i = s.length;
611 	}
612 
613 	static bool isWordChar(char c)
614 	{
615 	    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_';
616 	}
617 
618 	void reset(string code)
619 	{
620 		s = code;
621 		i = 0;
622 	}
623 
624 	void parseScope(Entity result, Token scopeEnd)
625 	{
626 		enum Level : int
627 		{
628 			zero,
629 			separator0,
630 			separatorMax = separator0 + separators.length - 1,
631 			text,
632 			max
633 		}
634 
635 		Entity[][Level.max] splitterQueue;
636 
637 		Entity[] terminateLevel(Level level)
638 		{
639 			level++;
640 			if (level == Level.max)
641 				return null;
642 
643 			auto r = splitterQueue[level] ~ group(terminateLevel(level));
644 			splitterQueue[level] = null;
645 			return r;
646 		}
647 
648 		tokenLoop:
649 		while (true)
650 		{
651 			Token token;
652 			string span;
653 			readToken(token, span);
654 
655 			if (token == Token.end)
656 			{
657 				assert(span.empty);
658 				break;
659 			}
660 			if (token == scopeEnd)
661 			{
662 				result.tail = span;
663 				break;
664 			}
665 
666 			auto e = new Entity();
667 			e.token = token;
668 			auto level = Level.text;
669 
670 			Entity after;
671 
672 			foreach (n, synonyms; separatorTokens)
673 				foreach (sep; synonyms)
674 					if (token == sep)
675 					{
676 						level = cast(Level)(Level.separator0 + n);
677 						e.children = terminateLevel(level);
678 						auto type = getSeparatorType(token);
679 						if (type == SeparatorType.prefix || type == SeparatorType.pair)
680 						{
681 							Entity empty = e;
682 							if (e.children.length)
683 							{
684 								e.token = Token.none;
685 								after = empty = new Entity(span);
686 								after.token = token;
687 							}
688 							else
689 								e.head = span;
690 
691 							if (type == SeparatorType.pair)
692 								parseScope(empty, pairTokens.find!(pair => pair.start == token).front.end);
693 						}
694 						else
695 							e.tail = span;
696 						goto handled;
697 					}
698 
699 			e.head = span;
700 
701 		handled:
702 			splitterQueue[level] ~= e;
703 			if (after)
704 				splitterQueue[level] ~= after;
705 		}
706 
707 		result.children ~= terminateLevel(Level.zero);
708 	}
709 
710 	Entity[] parse(string code)
711 	{
712 		reset(code);
713 		auto entity = new Entity;
714 		parseScope(entity, Token.none);
715 		assert(!entity.head && !entity.tail);
716 		postProcess(entity.children);
717 		return [entity];
718 	}
719 
720 	Entity[] parseToWords(string code)
721 	{
722 		reset(code);
723 		Entity[] result;
724 		while (!eos)
725 		{
726 			auto start = i;
727 			auto token = skipTokenOrWS();
728 			auto span = s[start..i];
729 
730 			if (token == Token.other)
731 				result ~= new Entity(span);
732 			else
733 			{
734 				if (!result.length)
735 					result ~= new Entity();
736 				if (!result[$-1].tail)
737 					result[$-1].tail = span;
738 				else
739 				{
740 					start = result[$-1].tail.ptr - s.ptr;
741 					result[$-1].tail = s[start..i];
742 				}
743 			}
744 		}
745 		return result;
746 	}
747 
748 	string stripComments(string code)
749 	{
750 		reset(code);
751 		auto result = appender!string();
752 		while (!eos)
753 		{
754 			auto start = i;
755 			Token t = skipTokenOrWS();
756 			if (t != Token.comment)
757 				result.put(s[start..i]);
758 		}
759 		return result.data;
760 	}
761 
762 	static Entity[] group(Entity[] entities)
763 	{
764 		if (entities.length <= 1)
765 			return entities;
766 		return [new Entity(null, entities, null)];
767 	}
768 
769 	static void postProcessSimplify(ref Entity[] entities)
770 	{
771 		for (size_t i=0; i<entities.length;)
772 		{
773 			auto e = entities[i];
774 			if (e.head.empty && e.tail.empty && e.dependencies.empty)
775 			{
776 				assert(e.token == Token.none);
777 				if (e.children.length == 0)
778 				{
779 					entities = entities.remove(i);
780 					continue;
781 				}
782 				else
783 				if (e.children.length == 1)
784 				{
785 					entities[i] = e.children[0];
786 					continue;
787 				}
788 			}
789 
790 			i++;
791 		}
792 	}
793 
794 	static void postProcessDependency(ref Entity[] entities)
795 	{
796 		if (entities.length < 2)
797 		{
798 			foreach (e; entities)
799 				postProcessDependency(e.children);
800 			return;
801 		}
802 
803 		size_t[] points;
804 		foreach_reverse (i, e; entities[0..$-1])
805 			if (getSeparatorType(e.token) == SeparatorType.binary && e.children)
806 				points ~= i;
807 
808 		if (points.length)
809 		{
810 			auto i = points[$/2];
811 			auto e = entities[i];
812 
813 			auto head = entities[0..i] ~ group(e.children);
814 			e.children = null;
815 			auto tail = new Entity(null, group(entities[i+1..$]), null);
816 			e.dependencies ~= tail;
817 			entities = group(head ~ e) ~ tail;
818 			foreach (c; entities)
819 				postProcessDependency(c.children);
820 		}
821 	}
822 
823 	static void postProcessTemplates(ref Entity[] entities)
824 	{
825 		if (!entities.length)
826 			return;
827 		foreach_reverse (i, e; entities[0..$-1])
828 			if (e.token == tokenLookup["!"] && entities[i+1].children.length && entities[i+1].children[0].token == tokenLookup["("])
829 			{
830 				auto dependency = new Entity;
831 				e.dependencies ~= dependency;
832 				entities[i+1].children[0].dependencies ~= dependency;
833 				entities = entities[0..i+1] ~ dependency ~ entities[i+1..$];
834 			}
835 	}
836 
837 	static void postProcessDependencyBlock(ref Entity[] entities)
838 	{
839 		foreach (i, e; entities)
840 			if (i && !e.token && e.children.length && getSeparatorType(e.children[0].token) == SeparatorType.binary && !e.children[0].children)
841 				e.children[0].dependencies ~= entities[i-1];
842 	}
843 
844 	static void postProcessBlockKeywords(ref Entity[] entities)
845 	{
846 		for (size_t i=0; i<entities.length;)
847 		{
848 			if (blockKeywordTokens.canFind(entities[i].token) && i+1 < entities.length)
849 			{
850 				auto j = i + 1;
851 				if (j < entities.length && entities[j].token == tokenLookup["("])
852 					j++;
853 				j++; // ; or {
854 				if (j <= entities.length)
855 				{
856 					entities = entities[0..i] ~ group(group(entities[i..j-1]) ~ entities[j-1..j]) ~ entities[j..$];
857 					continue;
858 				}
859 			}
860 
861 			i++;
862 		}
863 	}
864 
865 	static void postProcessBlockStatements(ref Entity[] entities)
866 	{
867 		for (size_t i=0; i<entities.length;)
868 		{
869 			auto j = i;
870 			bool consume(Token t)
871 			{
872 				if (j < entities.length
873 				 && entities[j].children.length == 2
874 				 && firstToken(entities[j].children[0]) == t)
875 				{
876 				 	j++;
877 				 	return true;
878 				}
879 				return false;
880 			}
881 			
882 			if (consume(tokenLookup["if"]) || consume(tokenLookup["static if"]))
883 				consume(tokenLookup["else"]);
884 			else
885 			if (consume(tokenLookup["do"]))
886 				consume(tokenLookup["while"]);
887 			else
888 			if (consume(tokenLookup["try"]))
889 			{
890 				while (consume(tokenLookup["catch"]))
891 					continue;
892 				consume(tokenLookup["finally"]);
893 			}
894 
895 			if (i == j)
896 			{
897 				j++;
898 				while (consume(tokenLookup["in"]) || consume(tokenLookup["out"]) || consume(tokenLookup["body"]))
899 					continue;
900 			}
901 
902 			if (j-i > 1)
903 			{
904 				entities = entities[0..i] ~ group(entities[i..j]) ~ entities[j..$];
905 				continue;
906 			}
907 
908 			i++;
909 		}
910 	}
911 
912 	static void postProcessPairs(ref Entity[] entities)
913 	{
914 		size_t lastPair = 0;
915 
916 		for (size_t i=0; i<entities.length;)
917 		{
918 			// Create pair entities
919 
920 			if (entities[i].token == tokenLookup["{"])
921 			{
922 				if (i >= lastPair + 1)
923 				{
924 					lastPair = i-1;
925 					entities = entities[0..lastPair] ~ group(group(entities[lastPair..i]) ~ entities[i]) ~ entities[i+1..$];
926 					i = lastPair;
927 					entities[i].isPair = true;
928 					lastPair++;
929 					continue;
930 				}
931 				else
932 					lastPair = i + 1;
933 			}
934 			else
935 			if (entities[i].token == tokenLookup[";"])
936 				lastPair = i + 1;
937 
938 			i++;
939 		}
940 	}
941 
942 	static void postProcessParens(ref Entity[] entities)
943 	{
944 		for (size_t i=0; i+1 < entities.length;)
945 		{
946 			if (parenKeywordTokens.canFind(entities[i].token))
947 			{
948 				auto pparen = firstHead(entities[i+1]);
949 				if (pparen
950 				 && *pparen !is entities[i+1]
951 				 && pparen.token == tokenLookup["("])
952 				{
953 					auto paren = *pparen;
954 					*pparen = new Entity();
955 					entities = entities[0..i] ~ group([entities[i], paren]) ~ entities[i+1..$];
956 					continue;
957 				}
958 			}
959 
960 			i++;
961 		}
962 
963 		foreach (e; entities)
964 			postProcessParens(e.children);
965 	}
966 
967 	static bool isValidIdentifier(string s)
968 	{
969 		if (!s.length)
970 			return false;
971 		if (!isAlpha(s[0]))
972 			return false;
973 		foreach (c; s[1..$])
974 			if (!isAlphaNum(c))
975 				return false;
976 		return true;
977 	}
978 
979 	/// Get all nodes between (exclusively) two addresses.
980 	/// If either address is empty, then the respective bound is the respective extreme.
981 	static Entity[] nodesBetween(Entity root, size_t[] a, size_t[] b)
982 	{
983 		while (a.length && b.length && a[0] == b[0])
984 		{
985 			root = root.children[a[0]];
986 			a = a[1..$];
987 			b = b[1..$];
988 		}
989 		size_t index0, index1;
990 		Entity[] children0, children1;
991 		if (a.length)
992 		{
993 			index0 = a[0] + 1;
994 			if (a.length > 1)
995 				children0 = nodesBetween(root.children[a[0]], a[1..$], null);
996 		}
997 		else
998 			index0 = 0;
999 
1000 		if (b.length)
1001 		{
1002 			index1 = b[0];
1003 			if (b.length > 1)
1004 				children1 = nodesBetween(root.children[b[0]], null, b[1..$]);
1005 		}
1006 		else
1007 			index1 = root.children.length;
1008 
1009 		assert(index0 <= index1);
1010 		return children0 ~ root.children[index0 .. index1] ~ children1;
1011 	}
1012 
1013 	static void postProcessRecursive(ref Entity[] entities)
1014 	{
1015 		foreach (e; entities)
1016 			if (e.children.length)
1017 				postProcessRecursive(e.children);
1018 
1019 		postProcessSimplify(entities);
1020 		postProcessTemplates(entities);
1021 		postProcessDependency(entities);
1022 		postProcessBlockKeywords(entities);
1023 		postProcessDependencyBlock(entities);
1024 		postProcessBlockStatements(entities);
1025 		postProcessPairs(entities);
1026 		postProcessParens(entities);
1027 	}
1028 
1029 	/// Attempt to link together function arguments / parameters for
1030 	/// things that look like calls to the same function, to allow removing
1031 	/// unused function arguments / parameters.
1032 	static void postProcessArgs(ref Entity[] entities)
1033 	{
1034 		string lastID;
1035 
1036 		Entity[][][string] calls;
1037 
1038 		void visit(Entity entity)
1039 		{
1040 			auto id = entity.head.strip();
1041 			if (entity.token == Token.other && isValidIdentifier(id) && !entity.tail && !entity.children)
1042 				lastID = id;
1043 			else
1044 			if (lastID && entity.token == tokenLookup["("])
1045 			{
1046 				size_t[] stack;
1047 				struct Comma { size_t[] addr, after; }
1048 				Comma[] commas;
1049 
1050 				bool afterComma;
1051 
1052 				// Find all top-level commas
1053 				void visit2(size_t i, Entity entity)
1054 				{
1055 					stack ~= i;
1056 					if (afterComma)
1057 					{
1058 						commas[$-1].after = stack;
1059 						//entity.comments ~= "After-comma %d".format(commas.length);
1060 						afterComma = false;
1061 					}
1062 
1063 					if (entity.token == tokenLookup[","])
1064 					{
1065 						commas ~= Comma(stack);
1066 						//entity.comments ~= "Comma %d".format(commas.length);
1067 						afterComma = true;
1068 					}
1069 					else
1070 					if (entity.head.length || entity.tail.length)
1071 						{}
1072 					else
1073 						foreach (j, child; entity.children)
1074 							visit2(j, child);
1075 					stack = stack[0..$-1];
1076 				}
1077 
1078 				foreach (i, child; entity.children)
1079 					visit2(i, child);
1080 
1081 				// Find all nodes between commas, effectively obtaining the arguments
1082 				size_t[] last = null;
1083 				commas ~= [Comma()];
1084 				Entity[][] args;
1085 				foreach (i, comma; commas)
1086 				{
1087 					//Entity entityAt(Entity root, size_t[] address) { return address.length ? entityAt(root.children[address[0]], address[1..$]) : root; }
1088 					//entityAt(entity, last).comments ~= "nodesBetween-left %d".format(i);
1089 					//entityAt(entity, comma.after).comments ~= "nodesBetween-right %d".format(i);
1090 					args ~= nodesBetween(entity, last, comma.after);
1091 					last = comma.addr;
1092 				}
1093 
1094 				// Register the arguments
1095 				foreach (i, arg; args)
1096 				{
1097 					debug
1098 						foreach (j, e; arg)
1099 							e.comments ~= "%s arg %d node %d".format(lastID, i, j);
1100 
1101 					if (arg.length == 1)
1102 					{
1103 						if (lastID !in calls)
1104 							calls[lastID] = null;
1105 						while (calls[lastID].length < i+1)
1106 							calls[lastID] ~= null;
1107 						calls[lastID][i] ~= arg[0];
1108 					}
1109 				}
1110 
1111 				lastID = null;
1112 				return;
1113 			}
1114 			else
1115 			if (entity.token == tokenLookup["!"])
1116 				{}
1117 			else
1118 			if (entity.head || entity.tail)
1119 				lastID = null;
1120 
1121 			foreach (child; entity.children)
1122 				visit(child);
1123 		}
1124 
1125 		foreach (entity; entities)
1126 			visit(entity);
1127 
1128 		// For each parameter, create a dummy empty node which is a dependency for all of the arguments.
1129 		auto callRoot = new Entity();
1130 		debug callRoot.comments ~= "Args root";
1131 		entities ~= callRoot;
1132 
1133 		foreach (id, params; calls)
1134 		{
1135 			auto funRoot = new Entity();
1136 			debug funRoot.comments ~= "%s root".format(id);
1137 			callRoot.children ~= funRoot;
1138 
1139 			foreach (i, args; params)
1140 			{
1141 				auto e = new Entity();
1142 				debug e.comments ~= "%s param %d".format(id, i);
1143 				funRoot.children ~= e;
1144 				foreach (arg; args)
1145 					arg.dependencies ~= e;
1146 			}
1147 		}
1148 	}
1149 
1150 	static void postProcess(ref Entity[] entities)
1151 	{
1152 		postProcessRecursive(entities);
1153 		postProcessArgs(entities);
1154 	}
1155 
1156 	static Entity* firstHead(ref Entity e)
1157 	{
1158 		if (e.head.length)
1159 			return &e;
1160 		foreach (ref c; e.children)
1161 		{
1162 			auto r = firstHead(c);
1163 			if (r)
1164 				return r;
1165 		}
1166 		return null;
1167 	}
1168 
1169 	static Token firstToken(Entity e)
1170 	{
1171 		while (!e.token && e.children.length)
1172 			e = e.children[0];
1173 		return e.token;
1174 	}
1175 }
1176 
1177 public:
1178 
1179 /// Split the text into sequences for each fun is always true, and then always false
1180 Entity[] parseSplit(alias fun)(string text)
1181 {
1182 	Entity[] result;
1183 	size_t i, wordStart, wordEnd;
1184 	for (i = 1; i <= text.length; i++)
1185 		if (i==text.length || (fun(text[i-1]) && !fun(text[i])))
1186 		{
1187 			if (wordStart != i)
1188 				result ~= new Entity(text[wordStart..wordEnd], null, text[wordEnd..i]);
1189 			wordStart = wordEnd = i;
1190 		}
1191 		else
1192 		if ((!fun(text[i-1]) && fun(text[i])))
1193 			wordEnd = i;
1194 	return result;
1195 }
1196 
1197 alias parseToWords = parseSplit!isNotAlphaNum;
1198 alias parseToLines = parseSplit!isNewline;
1199 
1200 /// Split s on end~start, preserving end and start on each chunk
1201 private string[] split2(string end, string start)(string s)
1202 {
1203 	enum sep = end ~ start;
1204 	return split2Impl(s, sep, end.length);
1205 }
1206 
1207 private string[] split2Impl(string s, string sep, size_t endLength)
1208 {
1209 	string[] result;
1210 	while (true)
1211 	{
1212 		auto i = s.indexOf(sep);
1213 		if (i < 0)
1214 			return result ~ s;
1215 		i += endLength;
1216 		result ~= s[0..i];
1217 		s = s[i..$];
1218 	}
1219 }
1220 
1221 unittest
1222 {
1223 	assert(split2!("]", "[")(null) == [""]);
1224 	assert(split2!("]", "[")("[foo]") == ["[foo]"]);
1225 	assert(split2!("]", "[")("[foo][bar]") == ["[foo]", "[bar]"]);
1226 	assert(split2!("]", "[")("[foo] [bar]") == ["[foo] [bar]"]);
1227 }
1228 
1229 Entity[] parseDiff(string s)
1230 {
1231 	return s
1232 		.split2!("\n", "diff ")
1233 		.map!(
1234 			(string file)
1235 			{
1236 				auto chunks = file.split2!("\n", "@@ ");
1237 				return new Entity(chunks[0], chunks[1..$].map!(chunk => new Entity(chunk)).array);
1238 			}
1239 		)
1240 		.array
1241 	;
1242 }
1243 
1244 private:
1245 
1246 bool isNewline(char c) { return c == '\r' || c == '\n'; }
1247 alias isNotAlphaNum = not!isAlphaNum;
1248 
1249 // https://d.puremagic.com/issues/show_bug.cgi?id=11824
1250 auto arrayMap(alias fun, T)(T[] arr)
1251 {
1252 	alias R = typeof(fun(arr[0]));
1253 	auto result = new R[arr.length];
1254 	foreach (i, v; arr)
1255 		result[i] = fun(v);
1256 	return result;
1257 }