
Threaded Upload of PostgreSQL WAL Files to S3 Bucket

This bash script spawns 10 parallel processes of s3-mp-upload.py, itself a multipart-parallelized Python script that uses the boto library, to upload the 16MB WAL files older than 8 hours found in the /pg_wal directory on our database server. The S3 bucket is set to keep 3 months of WAL files, and anything older is moved to Amazon Glacier. Once an upload has been verified, the local copy is deleted from the database server to free space for new WAL. We make sure to set the x-amz-server-side-encryption: AES256 header to take advantage of S3's server-side AES256 encryption.
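The 3-month retention and Glacier transition mentioned above are handled by a lifecycle rule on the bucket rather than by the upload script. Here's a one-time setup sketch using boto's lifecycle API (the rule id is arbitrary and 90 days is our approximation of "3 months"):

#!/usr/bin/env python
# One-time setup sketch: transition objects in the WAL bucket to Glacier after ~3 months.
import boto
from boto.s3.lifecycle import Lifecycle, Rule, Transition

s3 = boto.connect_s3()
bucket = s3.get_bucket("pgwal.mybucket.com")

to_glacier = Transition(days=90, storage_class="GLACIER")
rule = Rule("wal-to-glacier", prefix="", status="Enabled", transition=to_glacier)

lifecycle = Lifecycle()        # a Lifecycle is just a list of rules
lifecycle.append(rule)
bucket.configure_lifecycle(lifecycle)

With that policy in place, the uploads themselves are driven by the bash script below.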

#!/bin/bash

# this script runs in cron.daily
# purpose: find files in $PG_WAL directory that are older than 8 hours
#          and put those files in s3://$S3_BUCKET/. Those files are then
#          deleted, to free space for more WAL to accumulate in $PG_WAL.
# note:    each WAL file is 16MB
# requirements: s3-mp-upload.py in chaturbate_system_files/backups/
#               python
#               boto (python module)
#               argparse (python module)
#               make sure the server-side encryption header is set in S3
#               s3cmd, to verify each file exists in S3 before deleting it locally

export AWS_ACCESS_KEY_ID=XXXXXX
export AWS_SECRET_ACCESS_KEY=XXXXXX
export ENCRYPT=x-amz-server-side-encryption:AES256
export PATH=/usr/local/bin:$PATH

S3_MP_BIN=/root/system_files/backups/s3-mp-upload.py
S3_BUCKET=pgwal.mybucket.com
PG_WAL=/pg_wal
S3_THREADS=3 #s3-mp-upload threads
WAL_AGE=480 #480 minutes = 8 hours
THREADS=10 #bash threads
CHUNKSIZE=5
QUIET="" #use -q for crons

CNT=1
for i in `find $PG_WAL -type f -mmin +${WAL_AGE}`; do
        if [ $(($CNT % $THREADS)) -eq 0 ]; then
                #wait for spawned uploads x $THREADS to finish
                wait
                for j in $(seq 1 $(($THREADS-1))); do
                        # each uploaded WAL segment should show up as exactly 16MB in S3
                        if [ "`s3cmd ls s3://${S3_BUCKET}/${wal[$j]} | awk '{ print $3}'`" != "16777216" ]; then
                                # is this a .backup file? (those aren't 16MB, so skip the size check)
                                [ `echo ${wal[$j]} | awk -F\. '{print $2}'` ] && continue;

                                echo "problem uploading ${wal[$j]} to $S3_BUCKET!"
                                echo "`s3cmd ls s3://${S3_BUCKET}/${wal[$j]}` -- not 16777216 bytes!"
                                exit 1
                        else
                                [ -z "$QUIET" ] && echo rm -f $PG_WAL/${wal[$j]}
                                rm -f $PG_WAL/${wal[$j]}
                        fi
                done
        else
                f=`basename $i`
                # remember which filename went into this batch slot so the
                # verification pass above can check it after the batch finishes
                for j in $(seq 1 $(($THREADS-1))); do
                        [ $(($CNT % $THREADS)) -eq $j ] && wal[$j]=$f
                done
                $S3_MP_BIN -f $QUIET -np $S3_THREADS -s $CHUNKSIZE $i s3://$S3_BUCKET/$f &
        fi
        CNT=$(($CNT + 1))
done
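The size check in the loop above shells out to s3cmd. The same verification could be done with boto directly; here's a minimal sketch (the wal_uploaded_ok helper is just a name made up for illustration):

#!/usr/bin/env python
# Sketch: verify an uploaded WAL segment is the expected 16MB in S3
# before the local copy is removed (a boto alternative to `s3cmd ls`).
import sys
import boto

WAL_SIZE = 16 * 1024 * 1024  # every WAL segment is 16MB

def wal_uploaded_ok(bucket_name, wal_name):
    s3 = boto.connect_s3()            # reads AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY
    bucket = s3.get_bucket(bucket_name)
    key = bucket.get_key(wal_name)    # HEAD request; returns None if the key is missing
    return key is not None and key.size == WAL_SIZE

if __name__ == "__main__":
    sys.exit(0 if wal_uploaded_ok("pgwal.mybucket.com", sys.argv[1]) else 1)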

Here's a copy of s3-mp-upload.py, patched to request server-side encryption (the encrypt_key=True arguments below, which make boto send the x-amz-server-side-encryption: AES256 header):

#!/usr/bin/env python
import argparse
from cStringIO import StringIO
import logging
from math import ceil
from multiprocessing import Pool
import time
import urlparse

import boto

parser = argparse.ArgumentParser(description="Transfer large files to S3",
        prog="s3-mp-upload")
parser.add_argument("src", type=file, help="The file to transfer")
parser.add_argument("dest", help="The S3 destination object")
parser.add_argument("-np", "--num-processes", help="Number of processors to use",
        type=int, default=2)
parser.add_argument("-f", "--force", help="Overwrite an existing S3 key",
        action="store_true")
parser.add_argument("-s", "--split", help="Split size, in MB", type=int, default=50)
parser.add_argument("-rrs", "--reduced-redundancy", help="Use reduced redundancy storage. Default is standard.", default=False,  action="store_true")
parser.add_argument("-v", "--verbose", help="Be more verbose", default=False, action="store_true")
parser.add_argument("-q", "--quiet", help="Be less verbose (for use in cron jobs)", default=False, action="store_true")

logger = logging.getLogger("s3-mp-upload")

def do_part_upload(args):
    """
    Upload a part of a MultiPartUpload

    Open the target file and read in a chunk. Since we can't pickle
    S3Connection or MultiPartUpload objects, we have to reconnect and lookup
    the MPU object with each part upload.

    :type args: tuple of (string, string, string, int, int, int)
    :param args: The actual arguments of this method. Due to lameness of
                 multiprocessing, we have to extract these outside of the
                 function definition.

                 The arguments are: S3 Bucket name, MultiPartUpload id, file
                 name, the part number, part offset, part size
    """
    # Multiprocessing args lameness
    bucket_name, mpu_id, fname, i, start, size = args
    logger.debug("do_part_upload got args: %s" % (args,))

    # Connect to S3, get the MultiPartUpload
    s3 = boto.connect_s3()
    bucket = s3.lookup(bucket_name)
    mpu = None
    for mp in bucket.list_multipart_uploads():
        if mp.id == mpu_id:
            mpu = mp
            break
    if mpu is None:
        raise Exception("Could not find MultiPartUpload %s" % mpu_id)

    # Read the chunk from the file
    fp = open(fname, 'rb')
    fp.seek(start)
    data = fp.read(size)
    fp.close()
    if not data:
        raise Exception("Unexpectedly tried to read an empty chunk")

    def progress(x,y):
        logger.debug("Part %d: %0.2f%%" % (i+1, 100.*x/y))

    # Do the upload
    t1 = time.time()
    mpu.upload_part_from_file(StringIO(data), i+1, cb=progress)

    # Print some timings
    t2 = time.time() - t1
    s = len(data)/1024./1024.
    logger.info("Uploaded part %s (%0.2fM) in %0.2fs at %0.2fMbps" % (i+1, s, t2, s/t2))

def main(src, dest, num_processes=8, split=50, force=False, reduced_redundancy=False, verbose=False, quiet=False):
    # Check that dest is a valid S3 url
    split_rs = urlparse.urlsplit(dest)
    if split_rs.scheme != "s3":
        raise ValueError("'%s' is not an S3 url" % dest)

    s3 = boto.connect_s3()
    bucket = s3.lookup(split_rs.netloc)
    key = bucket.get_key(split_rs.path)
    # See if we're overwriting an existing key
    if key is not None:
        if not force:
            raise ValueError("'%s' already exists. Specify -f to overwrite it" % dest)

    # Determine the splits
    part_size = max(5*1024*1024, 1024*1024*split)
    src.seek(0,2)
    size = src.tell()
    num_parts = int(ceil(size / part_size))

    # If file is less than 5M, just upload it directly
    if size < 5*1024*1024:
        src.seek(0)
        t1 = time.time()
        k = boto.s3.key.Key(bucket,split_rs.path)
        k.set_contents_from_file(src, encrypt_key=True)
        t2 = time.time() - t1
        s = size/1024./1024.
        logger.info("Finished uploading %0.2fM in %0.2fs (%0.2fMbps)" % (s, t2, s/t2))
        return

    # Create the multi-part upload object
    mpu = bucket.initiate_multipart_upload(split_rs.path, reduced_redundancy=reduced_redundancy, encrypt_key=True)
    logger.info("Initialized upload: %s" % mpu.id)

    # Generate arguments for invocations of do_part_upload
    def gen_args(num_parts, fold_last):
        for i in range(num_parts+1):
            part_start = part_size*i
            if i == (num_parts-1) and fold_last is True:
                yield (bucket.name, mpu.id, src.name, i, part_start, part_size*2)
                break
            else:
                yield (bucket.name, mpu.id, src.name, i, part_start, part_size)


    # If the last part is less than 5M, just fold it into the previous part
    fold_last = ((size % part_size) < 5*1024*1024)

    # Do the thing
    try:
        # Create a pool of workers
        pool = Pool(processes=num_processes)
        t1 = time.time()
        pool.map_async(do_part_upload, gen_args(num_parts, fold_last)).get(9999999)
        # Print out some timings
        t2 = time.time() - t1
        s = size/1024./1024.
        # Finalize
        src.close()
        mpu.complete_upload()
        logger.info("Finished uploading %0.2fM in %0.2fs (%0.2fMbps)" % (s, t2, s/t2))
    except KeyboardInterrupt:
        logger.warn("Received KeyboardInterrupt, canceling upload")
        pool.terminate()
        mpu.cancel_upload()
    except Exception, err:
        logger.error("Encountered an error, canceling upload")
        logger.error(err)
        mpu.cancel_upload()

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    args = parser.parse_args()
    arg_dict = vars(args)
    if arg_dict['quiet'] == True:
        logger.setLevel(logging.WARNING)
    if arg_dict['verbose'] == True:
        logger.setLevel(logging.DEBUG)
    logger.debug("CLI args: %s" % args)
    main(**arg_dict)
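To spot-check that the patch is actually taking effect, you can HEAD one of the uploaded segments and look at the encryption attribute boto exposes. A minimal sketch, assuming your boto version populates Key.encrypted from the x-amz-server-side-encryption response header (the segment name below is just an example):

#!/usr/bin/env python
# Sketch: confirm S3 reports server-side encryption for an uploaded WAL segment.
import boto

s3 = boto.connect_s3()
bucket = s3.get_bucket("pgwal.mybucket.com")
key = bucket.get_key("000000010000000000000001")  # example WAL segment name
if key is None:
    print "segment not found in bucket"
else:
    print "server-side encryption:", key.encrypted  # expect AES256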
