From 215cd551df8be066edafd2a1e16d0bd143ec214b Mon Sep 17 00:00:00 2001
From: Samuel Merritt <sam@swiftstack.com>
Date: Tue, 21 Apr 2015 17:38:04 -0700
Subject: [PATCH] Bulk upload: treat user xattrs as object metadata

Currently, if you PUT a single object, then you can also associate
metadata with it by putting it in the request headers, prefixed with
"X-Object-Meta-". However, if you're bulk-uploading objects, then you
have no way to assign any metadata.

The tar file format* allows for arbitrary UTF-8 key/value pairs to be
associated with each file in an archive (as well as with the archive
itself, but we don't care about that here). If a file has extended
attributes, then tar will store those as key/value pairs.

This commit makes bulk upload read those extended attributes, if
present, and convert those to Swift object metadata. Attributes
starting with "user.meta." are converted to object metadata, and
"user.mime_type"** is converted to Content-Type.

For example, if you have a file "setup.py":

    $ setfattr -n user.mime_type -v "application/python-setup" setup.py
    $ setfattr -n user.meta.lunch -v "burger and fries" setup.py
    $ setfattr -n user.meta.dinner -v "baked ziti" setup.py
    $ setfattr -n user.stuff -v "whee" setup.py

This will get translated to headers:

    Content-Type: application/python-setup
    X-Object-Meta-Lunch: burger and fries
    X-Object-Meta-Dinner: baked ziti

Swift will handle xattrs stored by both GNU and BSD tar***. Only
xattrs user.mime_type and user.meta.* are processed; others are
ignored.

This brings bulk upload much closer to feature-parity with non-bulk upload.

* The POSIX 1003.1-2001 (pax) format, at least. There are a few
  different, mutually-incompatible tar formats out there, because of
  course there are. This is the default format on GNU tar 1.27.1 or
  later.

** http://standards.freedesktop.org/shared-mime-info-spec/latest/ar01s02.html#idm140622087713936

*** Even with pax-format tarballs, different encoders store xattrs
    slightly differently; for example, GNU tar stores the xattr
    "user.rubberducky" as pax header "SCHILY.xattr.user.rubberducky",
    while BSD tar (which uses libarchive) stores it as
    "LIBARCHIVE.xattr.user.rubberducky". One might wonder if this is
    some programmer's attempt at job security.

Change-Id: I5e3ce87d31054f5239e86d47c45adbde2bb93640
---
 swift/common/middleware/bulk.py          |  27 ++++++
 test/unit/common/middleware/test_bulk.py | 103 ++++++++++++++++++++++-
 2 files changed, 129 insertions(+), 1 deletion(-)

diff --git a/swift/common/middleware/bulk.py b/swift/common/middleware/bulk.py
index 7dc69b6ff1..888ff2356a 100644
--- a/swift/common/middleware/bulk.py
+++ b/swift/common/middleware/bulk.py
@@ -75,6 +75,23 @@ def get_response_body(data_format, data_dict, error_list):
     return output
 
 
+def pax_key_to_swift_header(pax_key):
+    """Map a pax header key to a Swift header name; None if not user meta."""
+    # GNU tar stores xattrs under "SCHILY.xattr.", BSD tar (libarchive)
+    # under "LIBARCHIVE.xattr."; both spellings are accepted.
+    if pax_key in (u"SCHILY.xattr.user.mime_type",
+                   u"LIBARCHIVE.xattr.user.mime_type"):
+        return "Content-Type"
+    for meta_prefix in (u"SCHILY.xattr.user.meta.",
+                        u"LIBARCHIVE.xattr.user.meta."):
+        if pax_key.startswith(meta_prefix):
+            suffix = pax_key[len(meta_prefix):]
+            return "X-Object-Meta-" + suffix.encode("utf-8")
+    # Anything else (atime/mtime/ctime, filesystem ACLs, other non-user
+    # attributes) is not user metadata and produces no header.
+    return None
+
+
 class Bulk(object):
     """
     Middleware that will do many operations on a single request.
@@ -464,6 +481,16 @@ class Bulk(object):
                     new_env['HTTP_USER_AGENT'] = \
                         '%s BulkExpand' % req.environ.get('HTTP_USER_AGENT')
                     create_obj_req = Request.blank(destination, new_env)
+
+                    for pax_key, pax_value in tar_info.pax_headers.items():
+                        header_name = pax_key_to_swift_header(pax_key)
+                        if header_name:
+                            # pax_key and pax_value are both unicode strings;
+                            # header_name already came back as a byte string,
+                            # so only the value still has to be encoded.
+                            create_obj_req.headers[header_name] = \
+                                pax_value.encode("utf-8")
+
                     resp = create_obj_req.get_response(self.app)
                     containers_accessed.add(container)
                     if resp.is_success:
diff --git a/test/unit/common/middleware/test_bulk.py b/test/unit/common/middleware/test_bulk.py
index 0f0b83a7d4..2bd0b78158 100644
--- a/test/unit/common/middleware/test_bulk.py
+++ b/test/unit/common/middleware/test_bulk.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 # Copyright (c) 2012 OpenStack Foundation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -25,9 +26,11 @@ from tempfile import mkdtemp
 from StringIO import StringIO
 from eventlet import sleep
 from mock import patch, call
+from test.unit.common.middleware.helpers import FakeSwift
 from swift.common import utils, constraints
 from swift.common.middleware import bulk
-from swift.common.swob import Request, Response, HTTPException
+from swift.common.swob import Request, Response, HTTPException, \
+    HTTPNoContent, HTTPCreated, HeaderKeyDict
 from swift.common.http import HTTP_NOT_FOUND, HTTP_UNAUTHORIZED
 
 
@@ -126,6 +129,104 @@ def build_tar_tree(tar, start_path, tree_obj, base_path=''):
         tar.addfile(tar_info)
 
 
+class TestUntarMetadata(unittest.TestCase):
+    """Check that bulk extraction turns pax xattr headers into PUT headers."""
+
+    def setUp(self):
+        self.app = FakeSwift()
+        self.bulk = bulk.filter_factory({})(self.app)
+        self.testdir = mkdtemp(suffix='tmp_test_bulk')
+
+    def tearDown(self):
+        rmtree(self.testdir, ignore_errors=True)
+
+    def test_extract_metadata(self):
+        # user.mime_type / user.meta.* xattrs must show up as PUT headers.
+        self.app.register('HEAD', '/v1/a/c?extract-archive=tar',
+                          HTTPNoContent, {}, None)
+        self.app.register('PUT', '/v1/a/c/obj1?extract-archive=tar',
+                          HTTPCreated, {}, None)
+        self.app.register('PUT', '/v1/a/c/obj2?extract-archive=tar',
+                          HTTPCreated, {}, None)
+
+        # It's a real pain to instantiate TarInfo objects directly; they
+        # really want to come from a file on disk or a tarball. So, we write
+        # out some files and add pax headers to them as they get placed into
+        # the tarball.
+        with open(os.path.join(self.testdir, "obj1"), "w") as fh1:
+            fh1.write("obj1 contents\n")
+        with open(os.path.join(self.testdir, "obj2"), "w") as fh2:
+            fh2.write("obj2 contents\n")
+
+        tar_ball = StringIO()
+        tar_file = tarfile.TarFile.open(fileobj=tar_ball, mode="w",
+                                        format=tarfile.PAX_FORMAT)
+
+        # With GNU tar 1.27.1 or later (possibly 1.27 as well), a file with
+        # extended attribute user.thingy = dingy gets put into the tarfile
+        # with pax_headers containing key/value pair
+        # (SCHILY.xattr.user.thingy, dingy), both unicode strings (py2: type
+        # unicode, not type str).
+        #
+        # With BSD tar (libarchive), you get key/value pair
+        # (LIBARCHIVE.xattr.user.thingy, dingy) instead. Uploads from both
+        # encoders are supported, so obj1 uses the GNU spelling and obj2 the
+        # BSD one.
+        with open(os.path.join(self.testdir, "obj1")) as fh1:
+            tar_info1 = tar_file.gettarinfo(fileobj=fh1, arcname="obj1")
+            tar_info1.pax_headers[u'SCHILY.xattr.user.mime_type'] = \
+                u'application/food-diary'
+            tar_info1.pax_headers[u'SCHILY.xattr.user.meta.lunch'] = \
+                u'sopa de albóndigas'
+            tar_info1.pax_headers[
+                u'SCHILY.xattr.user.meta.afternoon-snack'] = \
+                u'gigantic bucket of coffee'
+            tar_file.addfile(tar_info1, fh1)
+
+        with open(os.path.join(self.testdir, "obj2")) as fh2:
+            tar_info2 = tar_file.gettarinfo(fileobj=fh2, arcname="obj2")
+            tar_info2.pax_headers[
+                u'LIBARCHIVE.xattr.user.meta.muppet'] = u'bert'
+            tar_info2.pax_headers[
+                u'LIBARCHIVE.xattr.user.meta.cat'] = u'fluffy'
+            tar_info2.pax_headers[
+                u'LIBARCHIVE.xattr.user.notmeta'] = u'skipped'
+            tar_file.addfile(tar_info2, fh2)
+
+        # close() writes the end-of-archive blocks; tar_ball itself stays open
+        tar_file.close()
+        tar_ball.seek(0)
+
+        req = Request.blank('/v1/a/c?extract-archive=tar')
+        req.environ['REQUEST_METHOD'] = 'PUT'
+        req.environ['wsgi.input'] = tar_ball
+        req.headers['transfer-encoding'] = 'chunked'
+        req.headers['accept'] = 'application/json;q=1.0'
+
+        resp = req.get_response(self.bulk)
+        self.assertEqual(resp.status_int, 200)
+
+        # sanity check to make sure the upload worked
+        upload_status = utils.json.loads(resp.body)
+        self.assertEqual(upload_status['Number Files Created'], 2)
+
+        # [1] and [2] are the two object PUTs (presumably [0] is the HEAD)
+        put1_headers = HeaderKeyDict(self.app.calls_with_headers[1][2])
+        self.assertEqual(put1_headers.get('Content-Type'),
+                         'application/food-diary')
+        self.assertEqual(put1_headers.get('X-Object-Meta-Lunch'),
+                         'sopa de alb\xc3\xb3ndigas')
+        self.assertEqual(put1_headers.get('X-Object-Meta-Afternoon-Snack'),
+                         'gigantic bucket of coffee')
+
+        put2_headers = HeaderKeyDict(self.app.calls_with_headers[2][2])
+        self.assertEqual(put2_headers.get('X-Object-Meta-Muppet'), 'bert')
+        self.assertEqual(put2_headers.get('X-Object-Meta-Cat'), 'fluffy')
+        self.assertEqual(put2_headers.get('Content-Type'), None)
+        # user.notmeta is not under user.meta., so its value must not appear
+        self.assertFalse('skipped' in put2_headers.values())
+
+
 class TestUntar(unittest.TestCase):
 
     def setUp(self):