1
2 """
3 This module contains the HTTP fetcher interface and several implementations.
4 """
5
6 __all__ = ['fetch', 'getDefaultFetcher', 'setDefaultFetcher', 'HTTPResponse',
7 'HTTPFetcher', 'createHTTPFetcher', 'HTTPFetchingError',
8 'HTTPError']
9
10 import urllib2
11 import time
12 import cStringIO
13 import sys
14
15 import openid
16 import openid.urinorm
17
18
19
20 try:
21 import httplib2
22 except ImportError:
23
24 httplib2 = None
25
26
27 try:
28 import pycurl
29 except ImportError:
30 pycurl = None
31
32 USER_AGENT = "python-openid/%s (%s)" % (openid.__version__, sys.platform)
33 MAX_RESPONSE_KB = 1024
34
def fetch(url, body=None, headers=None):
    """Fetch C{url} using the module-level default fetcher.

    This convenience wrapper is all most users need.

    @raises Exception: any exceptions that may be raised by the default
        fetcher
    """
    return getDefaultFetcher().fetch(url, body, headers)
43
45 """Create a default HTTP fetcher instance
46
47 prefers Curl to urllib2."""
48 if pycurl is None:
49 fetcher = Urllib2Fetcher()
50 else:
51 fetcher = CurlHTTPFetcher()
52
53 return fetcher
54
55
56
57
58 _default_fetcher = None
59
73
75 """Set the default fetcher
76
77 @param fetcher: The fetcher to use as the default HTTP fetcher
78 @type fetcher: HTTPFetcher
79
80 @param wrap_exceptions: Whether to wrap exceptions thrown by the
        fetcher with HTTPFetchingError so that they may be caught
82 easier. By default, exceptions will be wrapped. In general,
83 unwrapped fetchers are useful for debugging of fetching errors
84 or if your fetcher raises well-known exceptions that you would
85 like to catch.
86 @type wrap_exceptions: bool
87 """
88 global _default_fetcher
89 if fetcher is None or not wrap_exceptions:
90 _default_fetcher = fetcher
91 else:
92 _default_fetcher = ExceptionWrappingFetcher(fetcher)
93
95 """Whether the currently set HTTP fetcher is a Curl HTTP fetcher."""
96 return isinstance(getDefaultFetcher(), CurlHTTPFetcher)
97
99 """XXX document attributes"""
100 headers = None
101 status = None
102 body = None
103 final_url = None
104
105 - def __init__(self, final_url=None, status=None, headers=None, body=None):
110
112 return "<%s status %s for %s>" % (self.__class__.__name__,
113 self.status,
114 self.final_url)
115
117 """
118 This class is the interface for openid HTTP fetchers. This
119 interface is only important if you need to write a new fetcher for
120 some reason.
121 """
122
123 - def fetch(self, url, body=None, headers=None):
124 """
125 This performs an HTTP POST or GET, following redirects along
126 the way. If a body is specified, then the request will be a
127 POST. Otherwise, it will be a GET.
128
129
130 @param headers: HTTP headers to include with the request
131 @type headers: {str:str}
132
133 @return: An object representing the server's HTTP response. If
134 there are network or protocol errors, an exception will be
135 raised. HTTP error responses, like 404 or 500, do not
136 cause exceptions.
137
138 @rtype: L{HTTPResponse}
139
140 @raise Exception: Different implementations will raise
141 different errors based on the underlying HTTP library.
142 """
143 raise NotImplementedError
144
146 return url.startswith('http://') or url.startswith('https://')
147
149 """Exception that is wrapped around all exceptions that are raised
150 by the underlying fetcher when using the ExceptionWrappingFetcher
151
152 @ivar why: The exception that caused this exception
153 """
155 Exception.__init__(self, why)
156 self.why = why
157
159 """Fetcher that wraps another fetcher, causing all exceptions
160
161 @cvar uncaught_exceptions: Exceptions that should be exposed to the
162 user if they are raised by the fetch call
163 """
164
165 uncaught_exceptions = (SystemExit, KeyboardInterrupt, MemoryError)
166
168 self.fetcher = fetcher
169
170 - def fetch(self, *args, **kwargs):
171 try:
172 return self.fetcher.fetch(*args, **kwargs)
173 except self.uncaught_exceptions:
174 raise
175 except:
176 exc_cls, exc_inst = sys.exc_info()[:2]
177 if exc_inst is None:
178
179 exc_inst = exc_cls
180
181 raise HTTPFetchingError(why=exc_inst)
182
184 """An C{L{HTTPFetcher}} that uses urllib2.
185 """
186
187
188
189 urlopen = staticmethod(urllib2.urlopen)
190
191 - def fetch(self, url, body=None, headers=None):
192 if not _allowedURL(url):
193 raise ValueError('Bad URL scheme: %r' % (url,))
194
195 if headers is None:
196 headers = {}
197
198 headers.setdefault(
199 'User-Agent',
200 "%s Python-urllib/%s" % (USER_AGENT, urllib2.__version__,))
201
202 req = urllib2.Request(url, data=body, headers=headers)
203 try:
204 f = self.urlopen(req)
205 try:
206 return self._makeResponse(f)
207 finally:
208 f.close()
209 except urllib2.HTTPError, why:
210 try:
211 return self._makeResponse(why)
212 finally:
213 why.close()
214
216 resp = HTTPResponse()
217 resp.body = urllib2_response.read(MAX_RESPONSE_KB * 1024)
218 resp.final_url = urllib2_response.geturl()
219 resp.headers = dict(urllib2_response.info().items())
220
221 if hasattr(urllib2_response, 'code'):
222 resp.status = urllib2_response.code
223 else:
224 resp.status = 200
225
226 return resp
227
229 """
230 This exception is raised by the C{L{CurlHTTPFetcher}} when it
231 encounters an exceptional situation fetching a URL.
232 """
233 pass
234
235
237 """
238 An C{L{HTTPFetcher}} that uses pycurl for fetching.
239 See U{http://pycurl.sourceforge.net/}.
240 """
241 ALLOWED_TIME = 20
242
247
249 header_file.seek(0)
250
251
252 unused_http_status_line = header_file.readline().lower ()
253 if unused_http_status_line.startswith('http/1.1 100 '):
254 unused_http_status_line = header_file.readline()
255 unused_http_status_line = header_file.readline()
256
257 lines = [line.strip() for line in header_file]
258
259
260 empty_line = lines.pop()
261 if empty_line:
262 raise HTTPError("No blank line at end of headers: %r" % (line,))
263
264 headers = {}
265 for line in lines:
266 try:
267 name, value = line.split(':', 1)
268 except ValueError:
269 raise HTTPError(
270 "Malformed HTTP header line in response: %r" % (line,))
271
272 value = value.strip()
273
274
275 name = name.lower()
276 headers[name] = value
277
278 return headers
279
281
282
283 return _allowedURL(url)
284
    def fetch(self, url, body=None, headers=None):
        """Fetch the URL with pycurl, following redirects by hand while
        enforcing an overall deadline of ALLOWED_TIME seconds across all
        redirect hops.

        @raises HTTPError: for disallowed URLs, redirects without a
            Location header, or overall timeout
        """
        # Absolute wall-clock deadline, and the remaining per-attempt
        # time budget ("off") that shrinks as redirects are followed.
        stop = int(time.time()) + self.ALLOWED_TIME
        off = self.ALLOWED_TIME

        if headers is None:
            headers = {}

        headers.setdefault('User-Agent',
                           "%s %s" % (USER_AGENT, pycurl.version,))

        # pycurl wants headers as a list of "Name: value" strings.
        header_list = []
        if headers is not None:
            # NOTE(review): headers can no longer be None here, so this
            # guard is redundant (but harmless).
            for header_name, header_value in headers.iteritems():
                header_list.append('%s: %s' % (header_name, header_value))

        c = pycurl.Curl()
        try:
            # Avoid signal-based timeouts, which are unsafe with threads.
            c.setopt(pycurl.NOSIGNAL, 1)

            if header_list:
                c.setopt(pycurl.HTTPHEADER, header_list)

            # Presence of a body means POST; otherwise the request is a
            # GET.
            if body is not None:
                c.setopt(pycurl.POST, 1)
                c.setopt(pycurl.POSTFIELDS, body)

            while off > 0:
                if not self._checkURL(url):
                    raise HTTPError("Fetching URL not allowed: %r" % (url,))

                data = cStringIO.StringIO()
                def write_data(chunk):
                    # Returning 0 from the write callback makes curl
                    # abort the transfer once the response exceeds the
                    # size cap.
                    if data.tell() > 1024*MAX_RESPONSE_KB:
                        return 0
                    else:
                        return data.write(chunk)

                response_header_data = cStringIO.StringIO()
                c.setopt(pycurl.WRITEFUNCTION, write_data)
                c.setopt(pycurl.HEADERFUNCTION, response_header_data.write)
                # Each hop gets only the remaining time budget.
                c.setopt(pycurl.TIMEOUT, off)
                c.setopt(pycurl.URL, openid.urinorm.urinorm(url))

                c.perform()

                response_headers = self._parseHeaders(response_header_data)
                code = c.getinfo(pycurl.RESPONSE_CODE)
                if code in [301, 302, 303, 307]:
                    url = response_headers.get('location')
                    if url is None:
                        raise HTTPError(
                            'Redirect (%s) returned without a location' % code)

                    # Redirects are always followed with a GET in this
                    # implementation, even for 307.
                    c.setopt(pycurl.POST, 0)

                else:
                    # Non-redirect status: package up the final response.
                    resp = HTTPResponse()
                    resp.headers = response_headers
                    resp.status = code
                    resp.final_url = url
                    resp.body = data.getvalue()
                    return resp

                # Recompute the remaining budget before the next hop.
                off = stop - int(time.time())

            raise HTTPError("Timed out fetching: %r" % (url,))
        finally:
            c.close()
357
359 """A fetcher that uses C{httplib2} for performing HTTP
360 requests. This implementation supports HTTP caching.
361
362 @see: http://bitworking.org/projects/httplib2/
363 """
364
366 """@param cache: An object suitable for use as an C{httplib2}
367 cache. If a string is passed, it is assumed to be a
368 directory name.
369 """
370 if httplib2 is None:
371 raise RuntimeError('Cannot find httplib2 library. '
372 'See http://bitworking.org/projects/httplib2/')
373
374 super(HTTPLib2Fetcher, self).__init__()
375
376
377 self.httplib2 = httplib2.Http(cache)
378
379
380
381 self.httplib2.force_exception_to_status_code = False
382
383 - def fetch(self, url, body=None, headers=None):
384 """Perform an HTTP request
385
386 @raises Exception: Any exception that can be raised by httplib2
387
388 @see: C{L{HTTPFetcher.fetch}}
389 """
390 if body:
391 method = 'POST'
392 else:
393 method = 'GET'
394
395 if headers is None:
396 headers = {}
397
398
399
400 if not (url.startswith('http://') or url.startswith('https://')):
401 raise ValueError('URL is not a HTTP URL: %r' % (url,))
402
403 httplib2_response, content = self.httplib2.request(
404 url, method, body=body, headers=headers)
405
406
407
408
409
410
411
412 try:
413 final_url = httplib2_response['content-location']
414 except KeyError:
415
416 assert not httplib2_response.previous
417
418
419 assert httplib2_response.status != 200
420 final_url = url
421
422 return HTTPResponse(
423 body=content,
424 final_url=final_url,
425 headers=dict(httplib2_response.items()),
426 status=httplib2_response.status,
427 )
428