The bash script below spawns 10 parallel processes of s3-mp-upload.py, which is itself a parallelized multipart-upload Python script built on the boto library. It uploads the 16MB WAL files in the /pg_wal directory on our database server that are older than 8 hours; once a file has been uploaded and verified, it is deleted locally to free space for new WAL. We make sure the x-amz-server-side-encryption:AES256 header is set so the files are stored with S3's server-side AES256 encryption. The S3 bucket keeps 3 months of WAL files, and anything older is transitioned to Amazon Glacier.
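That Glacier transition is just a lifecycle rule on the bucket. If you prefer to manage it from code rather than the AWS console, a minimal sketch with boto might look like the following (the rule id 'wal-to-glacier' and the empty prefix are placeholders, not something from our setup, and it assumes a boto 2.x version recent enough to support Glacier transitions):

#!/usr/bin/env python
# sketch: lifecycle rule that transitions WAL objects to Glacier after ~3 months (90 days)
# assumes boto 2.x with Glacier transition support and the same AWS credentials the upload script uses
import boto
from boto.s3.lifecycle import Lifecycle, Transition

s3 = boto.connect_s3()
bucket = s3.lookup('pgwal.mybucket.com')

lifecycle = Lifecycle()
# 'wal-to-glacier' is an arbitrary rule id; the empty prefix applies the rule to every key
lifecycle.add_rule('wal-to-glacier', prefix='', status='Enabled',
                   transition=Transition(days=90, storage_class='GLACIER'))
bucket.configure_lifecycle(lifecycle)

Here is the cron script that does the uploading: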
#!/bin/bash
# this script runs in cron.daily
# purpose: find files in $PG_WAL directory that are older than 8 hours
#          and put those files in s3://$S3_BUCKET/. Those files are then
#          deleted, to free space for more WAL to accumulate in $PG_WAL.
# note: each WAL file is 16MB
# requirements: s3-mp-upload.py in chaturbate_system_files/backups/
#               python
#               boto (python module)
#               argparse (python module)
#               make sure to set server side encryption header in S3
#               s3cmd to make sure file exists before deleting in s3.

export AWS_ACCESS_KEY_ID=XXXXXX
export AWS_SECRET_ACCESS_KEY=XXXXXX
export ENCRYPT=x-amz-server-side-encryption:AES256
export PATH=/usr/local/bin:$PATH

S3_MP_BIN=/root/system_files/backups/s3-mp-upload.py
S3_BUCKET=pgwal.mybucket.com
PG_WAL=/pg_wal
S3_THREADS=3   #s3-mp-upload threads
WAL_AGE=480    #480 minutes = 8 hours
THREADS=10     #bash threads
CHUNKSIZE=5
QUIET=""       #use -q for crons

CNT=1
for i in `find $PG_WAL -type f -mmin +${WAL_AGE}`; do
    if [ $(($CNT % $THREADS)) -eq 0 ]; then
        #wait for spawned uploads x $THREADS to finish
        wait
        for j in $(seq 1 $(($THREADS-1))); do
            if [ "`s3cmd ls s3://${S3_BUCKET}/${wal[$j]} | awk '{ print $3}'`" != "16777216" ]; then
                # is this a .backup file?
                [ `echo ${wal[$j]} | awk -F\. '{print $2}'` ] && continue;
                echo "problem uploading ${wal[$j]} to $S3_BUCKET!"
                echo "`s3cmd ls s3://${S3_BUCKET}/${wal[$j]}` -- not 16777216 bytes!"
                exit 0
            else
                $QUIET && echo rm -f $PG_WAL/${wal[$j]}
                rm -f $PG_WAL/${wal[$j]}
            fi
        done
    else
        f=`basename $i`
        for j in $(seq 1 $(($THREADS-1))); do
            [ $(($CNT % $THREADS)) -eq $j ] && wal[$j]=$f
        done
        $S3_MP_BIN -f $QUIET -np $S3_THREADS -s $CHUNKSIZE $i s3://$S3_BUCKET/$f &
    fi
    CNT=$(($CNT + 1))
done
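The only verification before the local rm is s3cmd reporting the expected 16777216 bytes. If you also want to confirm that the server-side encryption header actually took effect, something along these lines with boto should work (the WAL file name below is just an example, not a real key):

#!/usr/bin/env python
# sketch: double-check size and server-side encryption of one uploaded WAL segment
# assumes boto 2.x; '000000010000000000000001' is an example key name
import boto

s3 = boto.connect_s3()
bucket = s3.lookup('pgwal.mybucket.com')
key = bucket.get_key('000000010000000000000001')

if key is None:
    print("missing from S3 -- do not delete the local copy")
elif key.size != 16 * 1024 * 1024:
    print("size mismatch: %d bytes" % key.size)
elif key.encrypted != 'AES256':
    print("server-side encryption not reported: %r" % key.encrypted)
else:
    print("ok: 16MB, AES256 server-side encryption")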
Here's a copy of s3-mp-upload.py (patched to pass encrypt_key=True to boto so S3 applies the server-side encryption):
#!/usr/bin/env python
import argparse
from cStringIO import StringIO
import logging
from math import ceil
from multiprocessing import Pool
import time
import urlparse

import boto

parser = argparse.ArgumentParser(description="Transfer large files to S3",
        prog="s3-mp-upload")
parser.add_argument("src", type=file, help="The file to transfer")
parser.add_argument("dest", help="The S3 destination object")
parser.add_argument("-np", "--num-processes", help="Number of processors to use",
        type=int, default=2)
parser.add_argument("-f", "--force", help="Overwrite an existing S3 key",
        action="store_true")
parser.add_argument("-s", "--split", help="Split size, in Mb", type=int, default=50)
parser.add_argument("-rrs", "--reduced-redundancy",
        help="Use reduced redundancy storage. Default is standard.",
        default=False, action="store_true")
parser.add_argument("-v", "--verbose", help="Be more verbose",
        default=False, action="store_true")
parser.add_argument("-q", "--quiet", help="Be less verbose (for use in cron jobs)",
        default=False, action="store_true")

logger = logging.getLogger("s3-mp-upload")

def do_part_upload(args):
    """
    Upload a part of a MultiPartUpload

    Open the target file and read in a chunk. Since we can't pickle
    S3Connection or MultiPartUpload objects, we have to reconnect and lookup
    the MPU object with each part upload.

    :type args: tuple of (string, string, string, int, int, int)
    :param args: The actual arguments of this method. Due to lameness of
                 multiprocessing, we have to extract these outside of the
                 function definition.

                 The arguments are: S3 Bucket name, MultiPartUpload id, file
                 name, the part number, part offset, part size
    """
    # Multiprocessing args lameness
    bucket_name, mpu_id, fname, i, start, size = args
    logger.debug("do_part_upload got args: %s" % (args,))

    # Connect to S3, get the MultiPartUpload
    s3 = boto.connect_s3()
    bucket = s3.lookup(bucket_name)
    mpu = None
    for mp in bucket.list_multipart_uploads():
        if mp.id == mpu_id:
            mpu = mp
            break
    if mpu is None:
        raise Exception("Could not find MultiPartUpload %s" % mpu_id)

    # Read the chunk from the file
    fp = open(fname, 'rb')
    fp.seek(start)
    data = fp.read(size)
    fp.close()
    if not data:
        raise Exception("Unexpectedly tried to read an empty chunk")

    def progress(x, y):
        logger.debug("Part %d: %0.2f%%" % (i+1, 100.*x/y))

    # Do the upload
    t1 = time.time()
    mpu.upload_part_from_file(StringIO(data), i+1, cb=progress)

    # Print some timings
    t2 = time.time() - t1
    s = len(data)/1024./1024.
    logger.info("Uploaded part %s (%0.2fM) in %0.2fs at %0.2fMbps" % (i+1, s, t2, s/t2))

def main(src, dest, num_processes=8, split=50, force=False,
         reduced_redundancy=False, verbose=False, quiet=False):
    # Check that dest is a valid S3 url
    split_rs = urlparse.urlsplit(dest)
    if split_rs.scheme != "s3":
        raise ValueError("'%s' is not an S3 url" % dest)

    s3 = boto.connect_s3()
    bucket = s3.lookup(split_rs.netloc)
    key = bucket.get_key(split_rs.path)
    # See if we're overwriting an existing key
    if key is not None:
        if not force:
            raise ValueError("'%s' already exists. Specify -f to overwrite it" % dest)

    # Determine the splits
    part_size = max(5*1024*1024, 1024*1024*split)
    src.seek(0, 2)
    size = src.tell()
    num_parts = int(ceil(size / part_size))

    # If file is less than 5M, just upload it directly
    if size < 5*1024*1024:
        src.seek(0)
        t1 = time.time()
        k = boto.s3.key.Key(bucket, split_rs.path)
        k.set_contents_from_file(src, encrypt_key=True)
        t2 = time.time() - t1
        s = size/1024./1024.
        logger.info("Finished uploading %0.2fM in %0.2fs (%0.2fMbps)" % (s, t2, s/t2))
        return

    # Create the multi-part upload object
    mpu = bucket.initiate_multipart_upload(split_rs.path,
            reduced_redundancy=reduced_redundancy, encrypt_key=True)
    logger.info("Initialized upload: %s" % mpu.id)

    # Generate arguments for invocations of do_part_upload
    def gen_args(num_parts, fold_last):
        for i in range(num_parts+1):
            part_start = part_size*i
            if i == (num_parts-1) and fold_last is True:
                yield (bucket.name, mpu.id, src.name, i, part_start, part_size*2)
                break
            else:
                yield (bucket.name, mpu.id, src.name, i, part_start, part_size)

    # If the last part is less than 5M, just fold it into the previous part
    fold_last = ((size % part_size) < 5*1024*1024)

    # Do the thing
    try:
        # Create a pool of workers
        pool = Pool(processes=num_processes)
        t1 = time.time()
        pool.map_async(do_part_upload, gen_args(num_parts, fold_last)).get(9999999)
        # Print out some timings
        t2 = time.time() - t1
        s = size/1024./1024.
        # Finalize
        src.close()
        mpu.complete_upload()
        logger.info("Finished uploading %0.2fM in %0.2fs (%0.2fMbps)" % (s, t2, s/t2))
    except KeyboardInterrupt:
        logger.warn("Received KeyboardInterrupt, canceling upload")
        pool.terminate()
        mpu.cancel_upload()
    except Exception, err:
        logger.error("Encountered an error, canceling upload")
        logger.error(err)
        mpu.cancel_upload()

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    args = parser.parse_args()
    arg_dict = vars(args)
    if arg_dict['quiet'] == True:
        logger.setLevel(logging.WARNING)
    if arg_dict['verbose'] == True:
        logger.setLevel(logging.DEBUG)
    logger.debug("CLI args: %s" % args)

    main(**arg_dict)