updated htmlifier for safety and features

ensure that path of the file we're going to open is actually
inside our log root. If not return a 400.

escape dates in the regex so they are easier to pass around,
unencoded spaces kind of suck for that.

catch the possible IOError of opening a file that doesn't exist,
be nice and give people a 404 in that case.

move the content negotiation into a function, for readability

Change-Id: I334e1ac0419cd140c6af35c78634a2d7c05dcf01
This commit is contained in:
Sean Dague 2013-07-24 20:04:26 -04:00
parent fcc4dd4995
commit c7a2bf51b7
2 changed files with 59 additions and 17 deletions

View File

@ -17,8 +17,10 @@
import cgi
import fileinput
import os.path
import re
import sys
import urllib
import wsgiref.util
@ -67,10 +69,15 @@ def escape_html(line):
def link_timestamp(line):
    """Turn the leading timestamp of a log line into a self-linking anchor.

    A line is rewritten only when it starts with a date matching DATEFMT,
    optionally preceded by an opening ``<span ...>`` tag. The date is
    wrapped in an ``<a>`` element whose ``name`` and ``href`` carry the
    URL-quoted date, so the anchor contains no raw spaces; the visible
    link text stays the date exactly as it appears in the log.

    :param line: one line of log output
    :returns: the line with its timestamp linked, or the line unchanged
              when it does not start with a timestamp
    """
    m = re.match('(?P<span><span[^>]*>)?(?P<date>%s)' % DATEFMT, line)
    if not m:
        return line

    # The span group is optional: when it did not take part in the match
    # group() gives None, which must not be formatted into the output.
    span = m.group('span') or ''
    shown = m.group('date')
    anchor = urllib.quote(shown)
    # Everything after the date, line ending included, is carried over
    # untouched so the line keeps whatever terminator it came in with.
    return "%s<a name='%s' class='date' href='#%s'>%s</a>%s" % (
        span, anchor, anchor, shown, line[m.end():])
def passthrough_filter(fname):
@ -104,20 +111,54 @@ def htmlify_stdin():
out.write(_html_close())
def application(environ, start_response):
status = '200 OK'
def safe_path(root, environ):
    """Pull a safe filesystem path out of the request URL.

    The part of the request URI that follows ``htmlify/`` is resolved
    against root, and the result is handed back only if it is still
    located underneath root once ``..`` segments have been collapsed.

    :param root: directory that every served file must live under
    :param environ: WSGI environ of the current request
    :returns: absolute path of the requested file, or None when the URI
              has no ``htmlify/`` component or the path is not below root
    """
    # The query string is not part of the file name, so keep it out of
    # the URI that the path is extracted from.
    uri = wsgiref.util.request_uri(environ, include_query=False)
    match = re.search('htmlify/(.*)', uri)
    if match is None:
        return None

    newpath = os.path.abspath("%s/%s" % (root, match.group(1)))
    # Compare against root with exactly one trailing separator; a bare
    # string-prefix test would also accept siblings such as "<root>-evil"
    # when root is passed without a trailing slash.
    prefix = os.path.abspath(root).rstrip(os.sep) + os.sep
    if newpath.startswith(prefix):
        return newpath
    return None
def should_be_html(environ):
    """Decide whether the client asked for an HTML rendering.

    :param environ: WSGI environ of the current request
    :returns: True when an Accept header is present and mentions
              ``text/html``, False otherwise
    """
    accepted = environ.get('HTTP_ACCEPT', '')
    return 'text/html' in accepted
def application(environ, start_response):
    """WSGI entry point: serve a log file as HTML or as plain text.

    Responds with 400 when safe_path() rejects the requested URL, with
    404 when an IOError is raised while setting up the response, and
    otherwise with 200 and the output of html_filter() or
    passthrough_filter(), chosen by should_be_html().

    :param environ: WSGI environ of the current request
    :param start_response: WSGI start_response callable
    :returns: iterable producing the response body
    """
    logpath = safe_path('/srv/static/logs/', environ)
    if not logpath:
        start_response('400 Bad Request', [('Content-type', 'text/plain')])
        return ['Invalid file url']

    try:
        if should_be_html(environ):
            content_type, make_body = 'text/html', html_filter
        else:
            content_type, make_body = 'text/plain', passthrough_filter
        headers = [('Content-type', content_type)]
        # Build the body before start_response() so that an IOError from
        # the filter can still be answered with a 404 status line.
        # NOTE(review): this only helps if the filters fail eagerly; a lazy
        # generator would raise during iteration instead -- confirm.
        body = make_body(logpath)
        start_response('200 OK', headers)
        return body
    except IOError:
        start_response("404 Not Found", [('Content-type', 'text/plain')])
        return ['File Not Found']
# for development purposes, makes it easy to test the filter output

View File

@ -13,10 +13,6 @@ NameVirtualHost <%= vhost_name %>:<%= port %>
<% end -%>
DocumentRoot <%= docroot %>
RewriteEngine On
# rewrite all txt.gz files to map to our internal htmlify wsgi app
RewriteRule ^/(.*\.txt\.gz)$ /htmlify/$1 [QSA,L,PT]
WSGIScriptAlias /htmlify /usr/local/bin/htmlify-screen-log.py
# use Apache to compress the results afterwards, to save on the wire
# it's approx 18x savings of wire traffic to compress. We need to
# compress by content types that htmlify can produce
@ -40,6 +36,11 @@ NameVirtualHost <%= vhost_name %>:<%= port %>
ReadmeName /help/tempest-logs.html
</Directory>
RewriteEngine On
# rewrite all txt.gz files to map to our internal htmlify wsgi app
RewriteRule ^/(.*\.txt\.gz)$ /htmlify/$1 [QSA,L,PT]
WSGIScriptAlias /htmlify /usr/local/bin/htmlify-screen-log.py
ErrorLog /var/log/apache2/<%= name %>_error.log
LogLevel warn
CustomLog /var/log/apache2/<%= name %>_access.log combined