1 // Written in the D programming language.
2 
3 /++
4     Functions which operate on ASCII characters.
5 
6     All of the functions in std._ascii accept Unicode characters but
7     effectively ignore them if they're not ASCII. All $(D isX) functions return
8     $(D false) for non-ASCII characters, and all $(D toX) functions do nothing
9     to non-ASCII characters.
10 
11     For functions which operate on Unicode characters, see
12     $(MREF std, uni).
13 
14 $(SCRIPT inhibitQuickIndex = 1;)
15 $(DIVC quickindex,
16 $(BOOKTABLE,
17 $(TR $(TH Category) $(TH Functions))
$(TR $(TD Validation) $(TD
        $(LREF isAlpha)
        $(LREF isAlphaNum)
        $(LREF isASCII)
        $(LREF isControl)
        $(LREF isDigit)
        $(LREF isGraphical)
        $(LREF isHexDigit)
        $(LREF isLower)
        $(LREF isOctalDigit)
        $(LREF isPrintable)
        $(LREF isPunctuation)
        $(LREF isUpper)
        $(LREF isWhite)
))
32 $(TR $(TD Conversions) $(TD
33         $(LREF toLower)
34         $(LREF toUpper)
35 ))
36 $(TR $(TD Constants) $(TD
37         $(LREF digits)
38         $(LREF fullHexDigits)
39         $(LREF hexDigits)
40         $(LREF letters)
41         $(LREF lowercase)
42         $(LREF lowerHexDigits)
43         $(LREF newline)
44         $(LREF octalDigits)
45         $(LREF uppercase)
46         $(LREF whitespace)
47 ))
48 $(TR $(TD Enums) $(TD
49         $(LREF LetterCase)
50 ))
51 ))
52     References:
53         $(LINK2 http://www.digitalmars.com/d/ascii-table.html, ASCII Table),
54         $(HTTP en.wikipedia.org/wiki/Ascii, Wikipedia)
55 
56     License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
57     Authors:   $(HTTP digitalmars.com, Walter Bright) and Jonathan M Davis
58     Source:    $(PHOBOSSRC std/_ascii.d)
59   +/
60 module std.ascii;
61 
62 version (unittest)
63 {
64     // FIXME: When dmd bug #314 is fixed, make these selective.
65     import std.meta; // : AliasSeq;
66     import std.range; // : chain;
67     import std.traits; // : functionAttributes, FunctionAttribute, isSafe;
68 }
69 
70 
/// Hexadecimal digits in both cases: 0 .. 9, A .. F, a .. f
immutable fullHexDigits = "0123456789ABCDEFabcdef";

/// Hexadecimal digits with uppercase letters only: 0 .. 9, A .. F
immutable hexDigits = fullHexDigits[0 .. 16];

/// Hexadecimal digits with lowercase letters only: 0 .. 9, a .. f
immutable lowerHexDigits = "0123456789abcdef";

/// Decimal _digits: 0 .. 9
immutable digits = hexDigits[0 .. 10];

/// Octal digits: 0 .. 7
immutable octalDigits = digits[0 .. 8];

/// All ASCII _letters, uppercase first: A .. Z, a .. z
immutable letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";

/// Uppercase ASCII letters: A .. Z
immutable uppercase = letters[0 .. 26];

/// Lowercase ASCII letters: a .. z
immutable lowercase = letters[26 .. 52];

/// ASCII _whitespace: space, tab, vertical tab, carriage return, linefeed, form feed
immutable whitespace = " \t\v\r\n\f";
80 
/++
    Selects between the upper case and the lower case form of letters.
  +/
enum LetterCase : bool
{
    /// Upper case letters
    upper = false,

    /// Lower case letters
    lower = true
}
89 
/// The line terminator of the target system: "\r\n" on Windows, "\n" on Posix.
version (Windows)
    immutable newline = "\r\n";
else version (Posix)
    immutable newline = "\n";
else
    static assert(0, "Unsupported OS");
97 
98 
/++
    Tests for an ASCII letter or decimal digit.

    Params: c = The character to test.
    Returns: $(D true) if $(D c) is one of 0 .. 9, A .. Z or a .. z,
    $(D false) for every other character (including all non-ASCII ones).
  +/
bool isAlphaNum(dchar c) @safe pure nothrow @nogc
{
    // Anything outside '0' .. 'z' can be rejected immediately.
    if (c < '0' || c > 'z')
        return false;

    // Inside that window only the three alphanumeric runs qualify.
    return c <= '9' || c >= 'a' || ('A' <= c && c <= 'Z');
}
107 
///
@safe pure nothrow @nogc unittest
{
    assert( isAlphaNum('1'));
    assert( isAlphaNum('A'));
    assert(!isAlphaNum('#'));

    // Non-ASCII Unicode alphanumerics are deliberately rejected:
    assert(!isAlphaNum('á'));
}

@safe unittest
{
    // Every entry of every digit and letter table is alphanumeric.
    auto accepted = chain(digits, octalDigits, fullHexDigits, letters, lowercase, uppercase);
    foreach (ch; accepted)
    {
        assert(isAlphaNum(ch));
    }

    // Whitespace never is.
    foreach (ch; whitespace)
    {
        assert(!isAlphaNum(ch));
    }
}
127 
128 
/++
    Tests for an ASCII letter.

    Params: c = The character to test.
    Returns: $(D true) if $(D c) lies in A .. Z or a .. z, $(D false) otherwise.
  +/
bool isAlpha(dchar c) @safe pure nothrow @nogc
{
    // Two plain range checks; simple enough for the optimizer to fuse.
    immutable inUpperRun = 'A' <= c && c <= 'Z';
    immutable inLowerRun = 'a' <= c && c <= 'z';
    return inUpperRun || inLowerRun;
}
138 
///
@safe pure nothrow @nogc unittest
{
    assert( isAlpha('A'));
    assert(!isAlpha('#'));
    assert(!isAlpha('1'));

    // Non-ASCII Unicode alphabetic characters are deliberately rejected:
    assert(!isAlpha('á'));
}

@safe unittest
{
    // All letter tables are accepted ...
    foreach (ch; chain(letters, lowercase, uppercase))
    {
        assert(isAlpha(ch));
    }

    // ... digits and whitespace are not.
    foreach (ch; chain(digits, octalDigits, whitespace))
    {
        assert(!isAlpha(ch));
    }
}
158 
159 
/++
    Tests for a lowercase ASCII letter.

    Params: c = The character to test.
    Returns: $(D true) if $(D c) lies in a .. z, $(D false) otherwise.
  +/
bool isLower(dchar c) @safe pure nothrow @nogc
{
    return !(c < 'a' || c > 'z');
}
168 
///
@safe pure nothrow @nogc unittest
{
    assert( isLower('a'));
    assert(!isLower('#'));
    assert(!isLower('A'));

    // Non-ASCII Unicode letters are rejected regardless of their case.
    assert(!isLower('Á'));
    assert(!isLower('á'));
}

@safe unittest
{
    foreach (ch; lowercase)
    {
        assert(isLower(ch));
    }

    // Digits, uppercase letters and whitespace are all rejected.
    foreach (ch; chain(digits, uppercase, whitespace))
    {
        assert(!isLower(ch));
    }
}
189 
190 
/++
    Tests for an uppercase ASCII letter.

    Params: c = The character to test.
    Returns: $(D true) if $(D c) lies in A .. Z, $(D false) otherwise.
  +/
bool isUpper(dchar c) @safe pure nothrow @nogc
{
    return !(c < 'A' || c > 'Z');
}
199 
///
@safe pure nothrow @nogc unittest
{
    assert( isUpper('A'));
    assert(!isUpper('#'));
    assert(!isUpper('a'));

    // Non-ASCII Unicode letters are rejected regardless of their case.
    assert(!isUpper('Á'));
    assert(!isUpper('á'));
}

@safe unittest
{
    foreach (ch; uppercase)
    {
        assert(isUpper(ch));
    }

    // Digits, lowercase letters and whitespace are all rejected.
    foreach (ch; chain(digits, lowercase, whitespace))
    {
        assert(!isUpper(ch));
    }
}
220 
221 
/++
    Tests for an ASCII decimal digit.

    Params: c = The character to test.
    Returns: $(D true) if $(D c) lies in 0 .. 9, $(D false) otherwise.
  +/
bool isDigit(dchar c) @safe pure nothrow @nogc
{
    if (c < '0')
        return false;
    return c <= '9';
}
230 
///
@safe pure nothrow @nogc unittest
{
    assert( isDigit('3'));
    assert( isDigit('8'));
    assert(!isDigit('B'));
    assert(!isDigit('#'));

    // N.B.: does not return true for non-ASCII Unicode numbers.
    // The full-width digits are written as escapes so that the example
    // cannot silently turn into ASCII digits when the file is re-encoded
    // or normalized (which would make these assertions fail).
    assert(!isDigit('\uFF10')); // full-width digit zero (U+FF10)
    assert(!isDigit('\uFF14')); // full-width digit four (U+FF14)
}

@safe unittest
{
    // Every entry of the digit table is accepted.
    foreach (c; digits)
        assert(isDigit(c));

    // Letters and whitespace are rejected.
    foreach (c; chain(letters, whitespace))
        assert(!isDigit(c));
}
252 
253 
/++
    Tests for an ASCII octal digit.

    Params: c = The character to test.
    Returns: $(D true) if $(D c) is a base 8 digit (0 .. 7), $(D false) otherwise.
  +/
bool isOctalDigit(dchar c) @safe pure nothrow @nogc
{
    return '0' <= c && !(c > '7');
}
262 
///
@safe pure nothrow @nogc unittest
{
    // Both ends of the octal range are accepted ...
    assert( isOctalDigit('0'));
    assert( isOctalDigit('7'));

    // ... the first decimal-only digit, letters and punctuation are not.
    assert(!isOctalDigit('8'));
    assert(!isOctalDigit('#'));
    assert(!isOctalDigit('A'));
}

@safe unittest
{
    foreach (ch; octalDigits)
    {
        assert(isOctalDigit(ch));
    }

    foreach (ch; chain(letters, ['8', '9'], whitespace))
    {
        assert(!isOctalDigit(ch));
    }
}
281 
282 
/++
    Tests for an ASCII hexadecimal digit of either case.

    Params: c = The character to test.
    Returns: $(D true) if $(D c) is a base 16 digit (0 .. 9, A .. F, a .. f),
    $(D false) otherwise.
  +/
bool isHexDigit(dchar c) @safe pure nothrow @nogc
{
    // Anything outside '0' .. 'f' can be rejected immediately.
    if (c < '0' || c > 'f')
        return false;

    // Decimal digits and a .. f sit at the two ends of that window.
    if (c <= '9' || c >= 'a')
        return true;

    // What remains is the gap in between, where only A .. F qualifies.
    return 'A' <= c && c <= 'F';
}
291 
///
@safe pure nothrow @nogc unittest
{
    assert( isHexDigit('0'));
    assert( isHexDigit('A'));
    assert( isHexDigit('f')); // both letter cases are valid hex digits

    assert(!isHexDigit('G'));
    assert(!isHexDigit('g'));
    assert(!isHexDigit('#'));
}

@safe unittest
{
    foreach (ch; fullHexDigits)
    {
        assert(isHexDigit(ch));
    }

    // Letters from g/G onwards and whitespace are rejected.
    auto rejected = chain(lowercase[6 .. $], uppercase[6 .. $], whitespace);
    foreach (ch; rejected)
    {
        assert(!isHexDigit(ch));
    }
}
311 
312 
/++
    Tests for an ASCII whitespace character.

    Params: c = The character to test.
    Returns: $(D true) if $(D c) is the space, tab, linefeed, vertical tab,
    form feed or carriage return character, $(D false) otherwise.
  +/
bool isWhite(dchar c) @safe pure nothrow @nogc
{
    switch (c)
    {
        // '\t' .. '\r' are the contiguous code points 0x09 .. 0x0D.
        case ' ', '\t', '\n', '\v', '\f', '\r':
            return true;
        default:
            return false;
    }
}
323 
///
@safe pure nothrow @nogc unittest
{
    assert( isWhite(' '));
    assert( isWhite('\n'));
    assert( isWhite('\t'));
    assert(!isWhite('#'));
    assert(!isWhite('1'));
    assert(!isWhite('a'));

    // Non-ASCII Unicode whitespace is only recognized by std.uni.isWhite.
    static import std.uni;
    assert(std.uni.isWhite('\u00A0'));
    assert(!isWhite('\u00A0')); // std.ascii.isWhite
}

@safe unittest
{
    foreach (ch; whitespace)
    {
        assert(isWhite(ch));
    }

    foreach (ch; chain(digits, letters))
    {
        assert(!isWhite(ch));
    }
}
348 
349 
/++
    Tests for an ASCII control character.

    Params: c = The character to test.
    Returns: $(D true) if $(D c) is below 0x20 or equal to 0x7F,
    $(D false) otherwise.
  +/
bool isControl(dchar c) @safe pure nothrow @nogc
{
    // 0x7F is the one control character above the printable range.
    if (c == 0x7F)
        return true;
    return c < 0x20;
}
358 
///
@safe pure nothrow @nogc unittest
{
    assert( isControl('\0'));
    assert( isControl('\022'));
    assert( isControl('\n')); // a newline counts as whitespace and as control
    assert(!isControl(' '));
    assert(!isControl('#'));
    assert(!isControl('1'));
    assert(!isControl('a'));

    // Control characters outside of ASCII are not reported:
    assert(!isControl('\u0080'));
    assert(!isControl('\u2028'));
    assert(!isControl('\u2029'));
}

@safe unittest
{
    // 0x00 .. 0x1F plus 0x7F make up the control characters.
    assert(isControl(127));
    foreach (dchar ctrl; 0 .. 32)
    {
        assert(isControl(ctrl));
    }

    foreach (ch; chain(digits, letters, [' ']))
    {
        assert(!isControl(ch));
    }
}
385 
386 
/++
    Tests for an ASCII punctuation character.

    Params: c = The character to test.
    Returns: $(D true) if $(D c) is an ASCII character that is neither a
    control character, a letter, a digit, nor whitespace; $(D false)
    otherwise.
  +/
bool isPunctuation(dchar c) @safe pure nothrow @nogc
{
    // Only '!' .. '~' can qualify; that range excludes space and controls.
    if (c < '!' || c > '~')
        return false;

    // Of those, everything that is not a letter or digit is punctuation.
    return !isAlphaNum(c);
}
397 
///
@safe pure nothrow @nogc unittest
{
    assert( isPunctuation('.'));
    assert( isPunctuation(','));
    assert( isPunctuation(':'));
    assert( isPunctuation('!'));
    assert( isPunctuation('#'));
    assert( isPunctuation('~'));
    assert( isPunctuation('+'));
    assert( isPunctuation('_'));

    assert(!isPunctuation('1'));
    assert(!isPunctuation('a'));
    assert(!isPunctuation(' '));
    assert(!isPunctuation('\n'));
    assert(!isPunctuation('\0'));

    // N.B.: Non-ASCII Unicode punctuation characters are not recognized.
    assert(!isPunctuation('\u2012')); // (U+2012 = figure dash)
    assert(!isPunctuation('\u2013')); // (U+2013 = en dash)
}

@safe unittest
{
    // Within ASCII, punctuation is exactly what is left after removing
    // control characters, alphanumerics and the space character.
    foreach (dchar c; 0 .. 128)
    {
        if (isControl(c) || isAlphaNum(c) || c == ' ')
            assert(!isPunctuation(c));
        else
            assert(isPunctuation(c));
    }
}
430 
431 
/++
    Tests for a visible ASCII character.

    Params: c = The character to test.
    Returns: $(D true) if $(D c) is a printable character other than the
    space character, i.e. lies in ! .. ~; $(D false) otherwise.
  +/
bool isGraphical(dchar c) @safe pure nothrow @nogc
{
    return !(c < '!' || c > '~');
}
441 
///
@safe pure nothrow @nogc unittest
{
    assert( isGraphical('#'));
    assert( isGraphical('1'));
    assert( isGraphical('a'));
    assert(!isGraphical(' ')); // the space character has no visible glyph
    assert(!isGraphical('\0'));
    assert(!isGraphical('\n'));

    // Graphical characters outside of ASCII are not reported.
    assert(!isGraphical('á'));
}

@safe unittest
{
    // Within ASCII, graphical means: neither a control character nor space.
    foreach (dchar ch; 0 .. 128)
    {
        immutable expected = !(isControl(ch) || ch == ' ');
        assert(isGraphical(ch) == expected);
    }
}
466 
467 
/++
    Tests for a printable ASCII character.

    Params: c = The character to test.
    Returns: $(D true) if $(D c) is a printable character, the space
    character included, i.e. lies in ' ' .. ~; $(D false) otherwise.
  +/
bool isPrintable(dchar c) @safe pure nothrow @nogc
{
    return !(c < ' ' || c > '~');
}
477 
///
@safe pure nothrow @nogc unittest
{
    assert( isPrintable(' '));  // the space character counts as printable
    assert( isPrintable('#'));
    assert( isPrintable('1'));
    assert( isPrintable('a'));
    assert(!isPrintable('\0')); // control characters do not

    // Printable characters outside of ASCII are not reported.
    assert(!isPrintable('á'));
}

@safe unittest
{
    // Within ASCII, printable is the exact complement of control.
    foreach (dchar ch; 0 .. 128)
    {
        assert(isPrintable(ch) == !isControl(ch));
    }
}
501 
502 
/++
    Tests whether a character belongs to the ASCII character set.

    Params: c = The character to test.
    Returns: $(D true) if $(D c) lies in the range 0 .. 0x7F, $(D false)
    otherwise.
  +/
pragma(inline, true)
bool isASCII(dchar c) @safe pure nothrow @nogc
{
    return c < 0x80;
}
513 
///
@safe pure nothrow @nogc unittest
{
    assert(!isASCII('á'));
    assert( isASCII('a'));
}

@safe unittest
{
    // The first code point past the 7 bit range is rejected ...
    assert(!isASCII(128));

    // ... while everything below it is accepted.
    foreach (dchar ch; 0 .. 128)
    {
        assert(isASCII(ch));
    }
}
528 
529 
/++
    Maps an uppercase ASCII letter to its lowercase counterpart.

    Params: c = A value of any type that implicitly converts to $(D dchar).
    If that type is a built-in type, or an enum of a built-in type, the
    result has type $(D Unqual!(OriginalType!C)); if it is a user-defined
    type, the result has type $(D dchar).

    Returns: The matching lowercase letter if $(D c) is an uppercase ASCII
    letter; in every other case the value of $(D c), unchanged.
  +/
auto toLower(C)(C c)
if (is(C : dchar))
{
    import std.traits : isAggregateType, OriginalType, Unqual;

    // Aggregates (and enums based on them) are reported back as dchar,
    // everything else as its unqualified underlying built-in type.
    static if (isAggregateType!(OriginalType!C))
        alias Result = dchar;
    else
        alias Result = Unqual!(OriginalType!C);

    if (!isUpper(c))
        return cast(Result) c;

    return cast(Result)(cast(Result) c + 'a' - 'A');
}
554 
///
@safe pure nothrow @nogc unittest
{
    assert(toLower('A') == 'a');
    assert(toLower('a') == 'a');
    assert(toLower('#') == '#');

    // Uppercase letters outside of ASCII are passed through untouched.
    assert(toLower('Á') == 'Á');
}

@safe pure nothrow unittest
{
    foreach (C; AliasSeq!(char, wchar, dchar, immutable char, ubyte))
    {
        // Each uppercase letter maps onto the lowercase letter at the same index.
        foreach (idx, upper; uppercase)
            assert(toLower(cast(C) upper) == lowercase[idx]);

        // Within ASCII, a value changes exactly when it is an uppercase letter.
        foreach (C c; 0 .. 128)
        {
            immutable isUpperLetter = !(c < 'A' || c > 'Z');
            assert((toLower(c) != c) == isUpperLetter);
        }

        // Values beyond ASCII are never changed.
        foreach (C c; 128 .. C.max)
            assert(toLower(c) == c);

        // The conversion is usable during CTFE as well.
        static assert(toLower(cast(C)'A') == 'a');
        static assert(toLower(cast(C)'a') == 'a');
    }
}
590 
591 
/++
    Maps a lowercase ASCII letter to its uppercase counterpart.

    Params: c = A value of any type that implicitly converts to $(D dchar).
    If that type is a built-in type, or an enum of a built-in type, the
    result has type $(D Unqual!(OriginalType!C)); if it is a user-defined
    type, the result has type $(D dchar).

    Returns: The matching uppercase letter if $(D c) is a lowercase ASCII
    letter; in every other case the value of $(D c), unchanged.
  +/
auto toUpper(C)(C c)
if (is(C : dchar))
{
    import std.traits : isAggregateType, OriginalType, Unqual;

    // Aggregates (and enums based on them) are reported back as dchar,
    // everything else as its unqualified underlying built-in type.
    static if (isAggregateType!(OriginalType!C))
        alias Result = dchar;
    else
        alias Result = Unqual!(OriginalType!C);

    if (!isLower(c))
        return cast(Result) c;

    return cast(Result)(cast(Result) c - ('a' - 'A'));
}
616 
///
@safe pure nothrow @nogc unittest
{
    assert(toUpper('A') == 'A');
    assert(toUpper('a') == 'A');
    assert(toUpper('#') == '#');

    // Lowercase letters outside of ASCII are passed through untouched.
    assert(toUpper('á') == 'á');
}

@safe pure nothrow unittest
{
    foreach (C; AliasSeq!(char, wchar, dchar, immutable char, ubyte))
    {
        // Each lowercase letter maps onto the uppercase letter at the same index.
        foreach (idx, lower; lowercase)
            assert(toUpper(cast(C) lower) == uppercase[idx]);

        // Within ASCII, a value changes exactly when it is a lowercase letter.
        foreach (C c; 0 .. 128)
        {
            immutable isLowerLetter = !(c < 'a' || c > 'z');
            assert((toUpper(c) != c) == isLowerLetter);
        }

        // Values beyond ASCII are never changed.
        foreach (C c; 128 .. C.max)
            assert(toUpper(c) == c);

        // The conversion is usable during CTFE as well.
        static assert(toUpper(cast(C)'A') == 'A');
        static assert(toUpper(cast(C)'a') == 'A');
    }
}
651 
652 
@safe unittest // toUpper and toLower applied to types that are not built-in
{
    // Structs that convert implicitly to char, wchar and dchar.
    static struct UDC {  char c; alias c this; }
    static struct UDW { wchar c; alias c this; }
    static struct UDD { dchar c; alias c this; }

    // Enums based directly on the built-in character types.
    enum CE :  char {a = 'a', A = 'A'}
    enum WE : wchar {a = 'a', A = 'A'}
    enum DE : dchar {a = 'a', A = 'A'}

    // Enums based on the structs above.
    enum UDCE : UDC {a = UDC('a'), A = UDC('A')}
    enum UDWE : UDW {a = UDW('a'), A = UDW('A')}
    enum UDDE : UDD {a = UDD('a'), A = UDD('A')}

    // Structs with an implicit conversion to a character type.
    foreach (Wrapped; AliasSeq!(UDC, UDW, UDD))
    {
        assert(toLower(Wrapped('a')) == 'a');
        assert(toLower(Wrapped('A')) == 'a');

        static assert(toLower(Wrapped('a')) == 'a');
        static assert(toLower(Wrapped('A')) == 'a');
        static assert(toUpper(Wrapped('a')) == 'A');
        static assert(toUpper(Wrapped('A')) == 'A');
    }

    // All enum flavours, checked at run time and at compile time.
    foreach (E; AliasSeq!(CE, WE, DE, UDCE, UDWE, UDDE))
    {
        assert(toLower(E.a) == 'a');
        assert(toLower(E.A) == 'a');
        assert(toUpper(E.a) == 'A');
        assert(toUpper(E.A) == 'A');

        static assert(toLower(E.a) == 'a');
        static assert(toLower(E.A) == 'a');
        static assert(toUpper(E.a) == 'A');
        static assert(toUpper(E.A) == 'A');
    }

    // An enum of a built-in type yields that built-in type.
    foreach (E; AliasSeq!(CE, WE, DE))
    {
        static assert(is(typeof(toLower(E.init)) == OriginalType!E));
        static assert(is(typeof(toUpper(E.init)) == OriginalType!E));
    }

    // A struct, or an enum of a struct, yields dchar.
    foreach (U; AliasSeq!(UDC, UDW, UDD, UDCE, UDWE, UDDE))
    {
        static assert(is(typeof(toLower(U.init)) == dchar));
        static assert(is(typeof(toUpper(U.init)) == dchar));
    }
}