1 module gfm.net.uri;
2 
3 import std.range,
4        std.string,
5        std.ascii,
6        std.socket;
7 
8 
9 
/// Error raised whenever a string cannot be parsed as a URI.
class URIException : Exception
{
    /// Forwards the message and source location to the base `Exception`.
    public this(string message,
                string file = __FILE__,
                size_t line = __LINE__,
                Throwable next = null) @safe pure nothrow
    {
        super(message, file, line, next);
    }
}
21 
22 /**
23  
24    An attempt at implementing URI (RFC 3986).
25  
  All constructed URIs are valid and normalized.
27   Bugs: 
28   $(UL 
29     $(LI Separate segments in parsed form.)
30     $(LI Relative URL combining.)
31     $(LI . and .. normalization.)
32   )
33 
34   Alternative:
35     Consider using $(WEB vibed.org,vibe.d) if you need something better.
36  */
37 class URI
38 {
39     public
40     {
        /// Kind of host found in the authority part of a parsed URI.
        enum HostType
        {
            NONE,     /// No host in the URI.
            REG_NAME, /// Host is a registered name.
            IPV4,     /// Host is an IPv4 address.
            IPV6,     /// Host is a bracketed IPv6 address.
            IPVFUTURE /// Host is a bracketed literal starting with 'v' (IPvFuture).
        }
49 
50         /// Creactes an URI from an input range, throws if invalid.
51         /// Input should be an ENCODED url range.
52         /// Throws: $(D URIException) if the URI is invalid.
53         this(T)(T input) if (isForwardRange!T)
54         {
55             _scheme = null;
56             _hostType = HostType.NONE;
57             _hostName = null;            
58             _port = -1;
59             _userInfo = null;
60             _path = null;
61             _query = null;
62             _fragment = null;
63             parseURI(input);
64         }
65 
66         /// Checks URI validity.
67         /// Returns: true if input is valid.
68         static bool isValid(T)(T input) /* pure */ nothrow
69         {
70             try
71             {
72                 try
73                 {
74                     URI uri = new URI(input); 
75                     return true;
76                 }
77                 catch (URIException e)
78                 {
79                     return false;
80                 }
81             }
82             catch (Exception e)
83             {
84                 assert(false); // came here? Fix the library by writing the missing catch-case.
85             }
86         }
87 
88         // getters for normalized URI components
89 
90         /// Returns: URI scheme, guaranteed not null.
91         string scheme() pure const nothrow
92         {
93             return _scheme;
94         }
95 
96         /// Returns: Host name, or null if not available.
97         string hostName() pure const nothrow
98         {
99             return _hostName;
100         }
101 
102         /// Returns: Host type (HostType.NONE if not available).
103         HostType hostType() pure const nothrow
104         {
105             return _hostType;
106         }
107 
108         /** 
109          * Returns: port number. 
110          * If none is provided by the URI, return the default port for this scheme.
111          * If the scheme isn't recognized, return -1.
112          */
113         int port() pure const nothrow
114         {
115             if (_port != -1)
116                 return _port;
117 
118             foreach (ref e; knownSchemes)
119                 if (e.scheme == _scheme)
120                     return e.defaultPort;
121 
122             return -1;
123         }
124 
125         /// Returns: User-info part of the URI, or null if not available.
126         string userInfo() pure const nothrow
127         {
128             return _userInfo;
129         }
130 
131         /// Returns: Path part of the URI, never null, can be the empty string.
132         string path() pure const nothrow
133         {
134             return _path;
135         }
136 
137         /// Returns: Query part of the URI, or null if not available.
138         string query() pure const nothrow
139         {
140             return _query;
141         }
142 
143         /// Returns: Fragment part of the URI, or null if not available.
144         string fragment() pure const nothrow
145         {
146             return _fragment;
147         }
148 
149         /// Returns: Authority part of the URI.
150         string authority() pure const nothrow
151         {
152             if (_hostName is null)
153                 return null;
154 
155             string res = "";
156             if (_userInfo !is null)
157                 res = res ~ _userInfo ~ "@"; 
158             res ~= _hostName;
159             if (_port != -1)
160                 res = res ~ ":" ~ itos(_port);
161             return res;
162         }
163 
164         /// Resolves URI host name.
165         /// Returns: std.socket.Address from the URI.
166         Address resolveAddress()
167         {
168             final switch(_hostType)
169             {
170                 case HostType.REG_NAME:
171                 case HostType.IPV4:
172                     return new InternetAddress(_hostName, cast(ushort)port());
173 
174                 case HostType.IPV6:
175                     return new Internet6Address(_hostName, cast(ushort)port());
176 
177                 case HostType.IPVFUTURE:
178                 case HostType.NONE:
179                     throw new URIException("Cannot resolve such host");
180             }
181         }
182 
183         /// Returns: Pretty string representation.
184         override string toString() const
185         {
186             string res = _scheme ~ ":";
187 
188             if (_hostName is null)            
189                 res = res ~ "//" ~ authority();
190             res ~= _path;
191             if (_query !is null)
192                 res = res ~ "?" ~ _query;
193             if (_fragment !is null)
194                 res = res ~ "#" ~ _fragment;
195             return res;
196         }
197 
198         /// Semantic comparison of two URIs.
199         /// They are equals if they have the same normalized string representation.
200         bool opEquals(U)(U other) pure const nothrow if (is(U : FixedPoint))
201         {
202             return value == other.value;
203         }
204     }
205 
206     private
207     {
        // normalized URI components, filled in by parseURI
        string _scheme;     // never null, never empty
        string _userInfo;   // can be null
        HostType _hostType; // what the hostname string is (NONE if no host in URI)
        string _hostName;   // null if no authority in URI
        int _port;          // -1 if no port in URI
        string _path;       // meant to be never null, but could be empty
        string _query;      // can be null
        string _fragment;   // can be null
217 
218         // URI         = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
219         void parseURI(T)(ref T input)
220         {
221             _scheme = toLower(parseScheme(input));
222             consume(input, ':');
223             parseHierPart(input);
224 
225             if (input.empty)
226                 return;
227 
228             char c = popChar(input);
229 
230             if (c == '?')
231             {
232                 _query = parseQuery(input);
233 
234                 if (input.empty)
235                     return;
236 
237                 c = popChar(input);
238             }
239 
240             if (c == '#')
241             {
242                 _fragment = parseFragment(input);
243             }
244 
245             if (!input.empty)
246                 throw new URIException("unexpected characters at end of URI");
247         }
248 
249         string parseScheme(T)(ref T input)
250         {
251             string result = "";
252             char c = popChar(input);
253             if (!isAlpha(c))
254                 throw new URIException("expected alpha character in URI scheme");
255 
256             result ~= c;
257 
258             while(!input.empty)
259             {
260                 c = peekChar(input);
261 
262                 if (isAlpha(c) || isDigit(c) || "+-.".contains(c))
263                 {
264                     result ~= c;
265                     input.popFront();
266                 }
267                 else
268                     break;
269             }
270             return result;
271         }
272 
273         // hier-part   = "//" authority path-abempty
274         //             / path-absolute
275         //             / path-rootless
276         //             / path-empty
277         void parseHierPart(T)(ref T input)
278         {
279             if (input.empty())
280                 return; // path-empty
281 
282             char c = peekChar(input);
283             if (c == '/')
284             {
285                 input.popFront();
286                 T sinput = input.save;
287                 if (!input.empty() && peekChar(input) == '/')
288                 {
289                     consume(input, '/');
290                     parseAuthority(input);
291                     _path = parseAbEmpty(input);
292                 }
293                 else
294                 {
295                     input = sinput.save;
296                     _path = parsePathAbsolute(input);
297                 }
298             }
299             else
300             {
301                 _path = parsePathRootless(input);
302             }
303         }
304 
305         // authority   = [ userinfo "@" ] host [ ":" port ]
306         void parseAuthority(T)(ref T input)
307         {
308             // trying to parse user
309             T uinput = input.save;
310             try
311             {
312                 _userInfo = parseUserinfo(input);
313                 consume(input, '@');
314             }
315             catch(URIException e)
316             {
317                 // no user name in URI
318                 _userInfo = null;
319                 input = uinput.save;
320             }
321 
322             parseHost(input, _hostName, _hostType);
323 
324             if (!empty(input) && peekChar(input) == ':')
325             {
326                 consume(input, ':');
327                 _port = parsePort(input);
328             }
329         }
330 
331         string parsePcharString(T)(ref T input, bool allowColon, bool allowAt, bool allowSlashQuestionMark)
332         {
333             string res = "";
334 
335             while(!input.empty)
336             {
337                 char c = peekChar(input);
338 
339                 if (isUnreserved(c) || isSubDelim(c))
340                     res ~= popChar(input);
341                 else if (c == '%')
342                     res ~= parsePercentEncodedChar(input);
343                 else if (c == ':' && allowColon)
344                     res ~= popChar(input);
345                 else if (c == '@' && allowAt)
346                     res ~= popChar(input);
347                 else if ((c == '?' || c == '/') && allowSlashQuestionMark)
348                     res ~= popChar(input);
349                 else
350                     break;
351             }
352             return res;
353         }
354 
355         
356         void parseHost(T)(ref T input, out string res, out HostType hostType)
357         {
358             char c = peekChar(input);
359             if (c == '[')
360                 parseIPLiteral(input, res, hostType);
361             else
362             {
363                 T iinput = input.save;
364                 try
365                 {
366                     hostType = HostType.IPV4;
367                     res = parseIPv4Address(input);
368                 }
369                 catch (URIException e)
370                 {
371                     input = iinput.save;
372                     hostType = HostType.REG_NAME;
373                     res = toLower(parseRegName(input));
374                 }
375             }
376         }
377 
378         void parseIPLiteral(T)(ref T input, out string res, out HostType hostType)
379         {
380             consume(input, '[');
381             if (peekChar(input) == 'v')
382             {
383                 hostType = HostType.IPVFUTURE;
384                 res = parseIPv6OrFutureAddress(input);
385             }
386             else
387             {
388                 hostType = HostType.IPV6;
389                 string ipv6 = parseIPv6OrFutureAddress(input);
390 
391                 // validate and expand IPv6 (for normalizaton to be effective for comparisons)
392                 try
393                 {
394                     ubyte[16] bytes = Internet6Address.parse(ipv6);
395                     res = "";
396                     foreach (i ; 0..16)
397                     {
398                         if ((i & 1) == 0 && i != 0) 
399                             res ~= ":";
400                         res ~= format("%02x", bytes[i]);
401                     }
402                 }
403                 catch(SocketException e)
404                 {
405                     // IPv6 address did not parse
406                     throw new URIException(e.msg);
407                 }
408             }
409             consume(input, ']');
410         }
411 
412         string parseIPv6OrFutureAddress(T)(ref T input)
413         {
414             string res = "";
415             while (peekChar(input) != ']')
416                 res ~= popChar(input);
417             return res;
418         }
419 
420         string parseIPv4Address(T)(ref T input)
421         {
422             int a = parseDecOctet(input);
423             consume(input, '.');
424             int b = parseDecOctet(input);
425             consume(input, '.');
426             int c = parseDecOctet(input);
427             consume(input, '.');
428             int d = parseDecOctet(input);
429             return format("%s.%s.%s.%s", a, b, c, d);
430         }        
431 
432         // dec-octet     = DIGIT                 ; 0-9
433         //               / %x31-39 DIGIT         ; 10-99
434         //               / "1" 2DIGIT            ; 100-199
435         //               / "2" %x30-34 DIGIT     ; 200-249
436         //               / "25" %x30-35          ; 250-255
437         int parseDecOctet(T)(ref T input)
438         {
439             int res = popDigit(input);
440 
441             if (!input.empty && isDigit(peekChar(input)))
442             {
443                 res = 10 * res + popDigit(input);
444 
445                 if (!input.empty && isDigit(peekChar(input)))
446                     res = 10 * res + popDigit(input);
447             }
448 
449             if (res > 255)
450                 throw new URIException("out of range number in IPv4 address");
451 
452             return res;
453         }
454 
455         // query         = *( pchar / "/" / "?" )
456         string parseQuery(T)(ref T input)
457         {
458             return parsePcharString(input, true, true, true);
459         }
460 
461         // fragment      = *( pchar / "/" / "?" )
462         string parseFragment(T)(ref T input)
463         {
464             return parsePcharString(input, true, true, true);
465         }
466 
467         // pct-encoded   = "%" HEXDIG HEXDIG
468         char parsePercentEncodedChar(T)(ref T input)
469         {
470             consume(input, '%');
471 
472             int char1Val = hexValue(popChar(input));
473             int char2Val = hexValue(popChar(input));
474             return cast(char)(char1Val * 16 + char2Val);
475         }
476 
477         // userinfo      = *( unreserved / pct-encoded / sub-delims / ":" )
478         string parseUserinfo(T)(ref T input)
479         {
480             return parsePcharString(input, true, false, false);
481         }
482 
483         // reg-name      = *( unreserved / pct-encoded / sub-delims )
484         string parseRegName(T)(ref T input)
485         {
486             return parsePcharString(input, false, false, false);
487         }
488 
489         // port          = *DIGIT
490         int parsePort(T)(ref T input)
491         {
492             int res = 0;
493 
494             while(!input.empty)
495             {
496                 char c = peekChar(input);
497                 if (!isDigit(c))
498                     break;
499                 res = res * 10 + popDigit(input);
500             }
501             return res;
502         }
503 
504         // segment       = *pchar
505         // segment-nz    = 1*pchar
506         // segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
507         string parseSegment(T)(ref T input, bool allowZero, bool allowColon)
508         {
509             string res = parsePcharString(input, allowColon, true, false);
510             if (!allowZero && res == "")
511                 throw new URIException("expected a non-zero segment in URI");
512             return res;
513         }
514 
515         // path-abempty  = *( "/" segment )
516         string parseAbEmpty(T)(ref T input)
517         {
518             string res = "";
519             while (!input.empty)
520             {
521                 if (peekChar(input) != '/')
522                     break;
523                 consume(input, '/');
524                 res = res ~ "/" ~ parseSegment(input, true, true);
525             }
526             return res;
527         }
528 
529         // path-absolute = "/" [ segment-nz *( "/" segment ) ]
530         string parsePathAbsolute(T)(ref T input)
531         {
532             consume(input, '/');
533             string res = "/";
534 
535             try
536             {
537                 res ~= parseSegment(input, false, true);
538             }
539             catch(URIException e)
540             {
541                 return res;
542             }
543 
544             res ~= parseAbEmpty(input);
545             return res;
546         }
547 
548         string parsePathNoSlash(T)(ref T input, bool allowColonInFirstSegment)
549         {
550             string res = parseSegment(input, false, allowColonInFirstSegment);
551             res ~= parseAbEmpty(input);
552             return res;
553         }
554 
555         // path-noscheme = segment-nz-nc *( "/" segment )
556         string parsePathNoScheme(T)(ref T input)
557         {
558             return parsePathNoSlash(input, false);
559         }
560 
561         // path-rootless = segment-nz *( "/" segment )
562         string parsePathRootless(T)(ref T input)
563         {
564             return parsePathNoSlash(input, true);
565         }
566     }
567 }
568 
569 private pure
570 {
571     bool contains(string s, char c) nothrow
572     {
573         foreach(char sc; s)
574           if (c == sc)
575             return true;
576         return false;
577     }
578 
579     bool isAlpha(char c) nothrow
580     {
581         return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
582     }
583 
584     bool isDigit(char c) nothrow
585     {
586         return c >= '0' && c <= '9';
587     }
588 
589     bool isHexDigit(char c) nothrow
590     {
591         return hexValue(c) != -1;
592     }
593 
594     bool isUnreserved(char c) nothrow
595     {
596         return isAlpha(c) || isDigit(c) || "-._~".contains(c);
597     }
598 
599     bool isReserved(char c) nothrow
600     {
601         return isGenDelim(c) || isSubDelim(c);
602     }
603 
604     bool isGenDelim(char c) nothrow
605     {
606         return ":/?#[]@".contains(c);
607     }
608 
609     bool isSubDelim(char c) nothrow
610     {
611         return "!$&'()*+,;=".contains(c);
612     }
613 
614     int hexValue(char c) nothrow
615     {
616         if (isDigit(c))
617             return c - '0';
618         else if (c >= 'a' && c <= 'f')
619             return c - 'a';
620         else if (c >= 'A' && c <= 'F')
621             return c - 'A';
622         else
623             return -1;
624     }
625 
626     // peek char from input range, or throw
627     char peekChar(T)(ref T input)
628     {
629         if (input.empty())
630             throw new URIException("expected character");
631 
632         dchar c = input.front;
633 
634         if (cast(int)c >= 127)
635             throw new URIException("US-ASCII character expected");
636 
637         return cast(char)c;
638     }
639 
640     // pop char from input range, or throw
641     char popChar(T)(ref T input)
642     {
643         char result = peekChar(input);
644         input.popFront();
645         return result;
646     }
647 
648     int popDigit(T)(ref T input)
649     {
650         char c = popChar(input);
651         if (!isDigit(c))
652             throw new URIException("expected digit character");
653         return hexValue(c);
654     }
655 
656     void consume(T)(ref T input, char expected)
657     {
658         char c = popChar(input);
659         if (c != expected)
660             throw new URIException("expected '" ~ c ~ "' character");
661     }
662 
663     string itos(int i) pure nothrow
664     {
665         string res = "";
666         do
667         {
668             res = ('0' + (i % 10)) ~ res;
669             i = i / 10;
670         } while (i != 0);
671         return res;
672     }
673 
    // Associates a scheme name with the port used when a URI gives none.
    struct KnownScheme
    {
        string scheme;   // scheme name, compared against the lower-cased URI scheme
        int defaultPort; // port returned by URI.port() when the URI has no explicit port
    }
679 
    // Table of schemes with a known default port, searched by URI.port().
    enum knownSchemes =
    [
        KnownScheme("ftp", 21),
        KnownScheme("sftp", 22),
        KnownScheme("telnet", 23),
        KnownScheme("smtp", 25),
        KnownScheme("gopher", 70),
        KnownScheme("http", 80),
        KnownScheme("nntp", 119),
        KnownScheme("https", 443)
    ];
691 
692 }
693 
unittest
{
    
    // full URI: every component is parsed, the scheme is lower-cased and
    // percent-encoded characters are decoded
    {
        string s = "HTTP://machin@fr.wikipedia.org:80/wiki/Uniform_Resource_Locator?Query%20Part=4#fragment%20part";
        assert(URI.isValid(s));
        auto uri = new URI(s);
        assert(uri.scheme() == "http");
        assert(uri.userInfo() == "machin");
        assert(uri.hostName() == "fr.wikipedia.org");
        assert(uri.port() == 80);
        assert(uri.authority() == "machin@fr.wikipedia.org:80");
        assert(uri.path() == "/wiki/Uniform_Resource_Locator");
        assert(uri.query() == "Query Part=4");
        assert(uri.fragment() == "fragment part");
    }

    // host tests: one URI per HostType that can be parsed
    {
        assert((new URI("http://truc.org")).hostType() == URI.HostType.REG_NAME);
        assert((new URI("http://127.0.0.1")).hostType() == URI.HostType.IPV4);
        assert((new URI("http://[2001:db8::7]")).hostType() == URI.HostType.IPV6);
        assert((new URI("http://[v9CrazySchemeFromOver9000year]")).hostType() == URI.HostType.IPVFUTURE);
    }

    // a selection of URIs that must all be accepted
    auto wellFormedURIs =
    [
        "ftp://ftp.rfc-editor.org/in-notes/rfc2396.txt",
        "mailto:Quidam.no-spam@example.com",
        "news:fr.comp.infosystemes.www.auteurs",
        "gopher://gopher.quux.org/",
        "http://Jojo:lApIn@www.example.com:8888/chemin/d/acc%C3%A8s.php?q=req&q2=req2#signet",
        "ldap://[2001:db8::7]/c=GB?objectClass?one",
        "mailto:John.Doe@example.com",
        "tel:+1-816-555-1212",
        "telnet://192.0.2.16:80/",
        "urn:oasis:names:specification:docbook:dtd:xml:4.1.2",
        "about:",
    ];

    foreach (wuri; wellFormedURIs)
    {
        bool valid = URI.isValid(wuri);
        assert(valid);
    }
}