1  import re 
  2   
  3   
  4  uri_pattern = r'^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?' 
  5  uri_re = re.compile(uri_pattern) 
  6   
  7   
  8   
  9   
 10   
 11   
 12   
 13   
 14  uri_illegal_char_re = re.compile( 
 15      "[^-A-Za-z0-9:/?#[\]@!$&'()*+,;=._~%]", re.UNICODE) 
 16   
 17  authority_pattern = r'^([^@]*@)?([^:]*)(:.*)?' 
 18  authority_re = re.compile(authority_pattern) 
 19   
 20   
 21  pct_encoded_pattern = r'%([0-9A-Fa-f]{2})' 
 22  pct_encoded_re = re.compile(pct_encoded_pattern) 
 23   
 24  try: 
 25      unichr(0x10000) 
 26  except ValueError: 
 27       
 28      UCSCHAR = [ 
 29          (0xA0, 0xD7FF), 
 30          (0xF900, 0xFDCF), 
 31          (0xFDF0, 0xFFEF), 
 32          ] 
 33   
 34      IPRIVATE = [ 
 35          (0xE000, 0xF8FF), 
 36          ] 
 37  else: 
 38      UCSCHAR = [ 
 39          (0xA0, 0xD7FF), 
 40          (0xF900, 0xFDCF), 
 41          (0xFDF0, 0xFFEF), 
 42          (0x10000, 0x1FFFD), 
 43          (0x20000, 0x2FFFD), 
 44          (0x30000, 0x3FFFD), 
 45          (0x40000, 0x4FFFD), 
 46          (0x50000, 0x5FFFD), 
 47          (0x60000, 0x6FFFD), 
 48          (0x70000, 0x7FFFD), 
 49          (0x80000, 0x8FFFD), 
 50          (0x90000, 0x9FFFD), 
 51          (0xA0000, 0xAFFFD), 
 52          (0xB0000, 0xBFFFD), 
 53          (0xC0000, 0xCFFFD), 
 54          (0xD0000, 0xDFFFD), 
 55          (0xE1000, 0xEFFFD), 
 56          ] 
 57   
 58      IPRIVATE = [ 
 59          (0xE000, 0xF8FF), 
 60          (0xF0000, 0xFFFFD), 
 61          (0x100000, 0x10FFFD), 
 62          ] 
 63   
 64   
 65  _unreserved = [False] * 256 
 66  for _ in range(ord('A'), ord('Z') + 1): _unreserved[_] = True 
 67  for _ in range(ord('0'), ord('9') + 1): _unreserved[_] = True 
 68  for _ in range(ord('a'), ord('z') + 1): _unreserved[_] = True 
 69  _unreserved[ord('-')] = True 
 70  _unreserved[ord('.')] = True 
 71  _unreserved[ord('_')] = True 
 72  _unreserved[ord('~')] = True 
 73   
 74   
 75  _escapeme_re = re.compile('[%s]' % (''.join( 
 76      map(lambda (m, n): u'%s-%s' % (unichr(m), unichr(n)), 
 77          UCSCHAR + IPRIVATE)),)) 
 78   
 79   
 81      c = char_match.group() 
 82      return ''.join(['%%%X' % (ord(octet),) for octet in c.encode('utf-8')]) 
  83   
 84   
 86      try: 
 87          i = int(mo.group(1), 16) 
 88          if _unreserved[i]: 
 89              return chr(i) 
 90          else: 
 91              return mo.group().upper() 
 92   
 93      except ValueError: 
 94          return mo.group() 
  95   
 96   
 98      try: 
 99          return chr(int(mo.group(1), 16)) 
100      except ValueError: 
101          return mo.group() 
 102   
103   
def remove_dot_segments(path):
    """Remove ``.`` and ``..`` segments from a URI path.

    The input is consumed from the left while an output list of segments is
    built up; this follows the step-by-step procedure of RFC 3986,
    section 5.2.4.

    :param path: the path component of a URI (may be empty).
    :returns: the path with all dot segments resolved. ``..`` segments that
        would climb above the start of the path are simply dropped.
    """
    result_segments = []

    while path:
        if path.startswith('../'):
            # Leading '../' has nothing to climb out of: drop it.
            path = path[3:]
        elif path.startswith('./'):
            path = path[2:]
        elif path.startswith('/./'):
            # '/./x' -> '/x' (keep the slash that starts the next segment).
            path = path[2:]
        elif path == '/.':
            path = '/'
        elif path.startswith('/../'):
            # '/../x' -> '/x', discarding the segment emitted last (if any).
            path = path[3:]
            if result_segments:
                result_segments.pop()
        elif path == '/..':
            path = '/'
            if result_segments:
                result_segments.pop()
        elif path == '..' or path == '.':
            path = ''
        else:
            # Move the first segment (with its leading '/', if present, and
            # up to but excluding the next '/') to the output.
            start = 1 if path[0] == '/' else 0
            end = path.find('/', start)
            if end == -1:
                end = len(path)
            result_segments.append(path[:end])
            path = path[end:]

    return ''.join(result_segments)
 137   
138   
# NOTE(review): the ``def`` line of this function is absent from the listing.
# The parameter name ``uri`` comes from the body; the function name
# ``urinorm`` is assumed from upstream python-openid -- confirm against callers.
def urinorm(uri):
    """Normalize an absolute ``http``/``https`` URI and return it.

    Steps, in order:

    * for ``unicode`` input, percent-escape every character matched by
      ``_escapeme_re`` and encode the result as ASCII;
    * reject any character matched by ``uri_illegal_char_re``;
    * lower-case the scheme and the host; a host containing percent-escapes
      is additionally decoded and IDNA-encoded;
    * drop an empty port and the scheme's default port (80 / 443);
    * in the path, decode escapes of unreserved characters, upper-case the
      remaining escapes, remove dot segments, and use ``/`` if it is empty;
    * keep userinfo, query and fragment exactly as given.

    This body relies on the Python 2 ``unicode`` builtin.

    :param uri: the URI to normalize.
    :returns: the normalized URI string.
    :raises ValueError: if the URI contains illegal characters, has no
        scheme, has a scheme other than http/https, or has no authority.
    """
    if isinstance(uri, unicode):
        uri = _escapeme_re.sub(_pct_escape_unicode, uri).encode('ascii')

    illegal_mo = uri_illegal_char_re.search(uri)
    if illegal_mo:
        raise ValueError('Illegal characters in URI: %r at position %s' %
                         (illegal_mo.group(), illegal_mo.start()))

    # Every group of uri_re is optional, so match() always succeeds here.
    uri_mo = uri_re.match(uri)

    scheme = uri_mo.group(2)
    if scheme is None:
        raise ValueError('No scheme specified')

    scheme = scheme.lower()
    if scheme not in ('http', 'https'):
        raise ValueError('Not an absolute HTTP or HTTPS URI: %r' % (uri,))

    authority = uri_mo.group(4)
    if authority is None:
        raise ValueError('Not an absolute URI: %r' % (uri,))

    authority_mo = authority_re.match(authority)
    if authority_mo is None:
        raise ValueError('URI does not have a valid authority: %r' % (uri,))

    userinfo, host, port = authority_mo.groups()

    if userinfo is None:
        userinfo = ''

    # Lower-casing leaves '%' untouched, so it can be done before the check.
    host = host.lower()
    if '%' in host:
        host = pct_encoded_re.sub(_pct_encoded_replace, host)
        host = unicode(host, 'utf-8').encode('idna')

    # A missing port, a bare ':' and the scheme's default port all
    # normalize to "no port".
    if (not port or
            port == ':' or
            (scheme == 'http' and port == ':80') or
            (scheme == 'https' and port == ':443')):
        port = ''

    authority = userinfo + host + port

    path = uri_mo.group(5)
    path = pct_encoded_re.sub(_pct_encoded_replace_unreserved, path)
    path = remove_dot_segments(path)
    if not path:
        path = '/'

    # Group 6 includes the leading '?', group 8 the leading '#', so both
    # can be appended verbatim.
    query = uri_mo.group(6)
    if query is None:
        query = ''

    fragment = uri_mo.group(8)
    if fragment is None:
        fragment = ''

    return scheme + '://' + authority + path + query + fragment
 203