【AWS Glue】S3ファイル操作覚書
①フォルダ配下のファイルを削除する(Glue ETL)
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
## @params: [JOB_NAME]
# Glue ETL job that deletes every file under a fixed S3 prefix.
args = getResolvedOptions(sys.argv, ['JOB_NAME'])

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session

# Initialise the job before doing any work (standard Glue script layout),
# so that init/commit bracket the purge instead of both running after it.
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# Recursively delete the files under the prefix. retentionPeriod=0 is passed
# so that no files are retained on account of being recent
# (NOTE(review): per the Glue docs the unit is hours and the default is 168 - confirm).
glueContext.purge_s3_path(
    "s3://s3-bucket-name/temp/",
    options={"retentionPeriod": 0},
    transformation_ctx="",
)

job.commit()
purge_s3_path は、指定した Amazon S3 パス配下のファイルを再帰的に削除します。
※ temp フォルダ自体は削除されずに残ります。
②ファイルリネーム(Python shell)
※実際にはリネームではなく、ファイル名を指定してコピーしている(コピー元のファイルは削除されずに残る)
import sys
import boto3
import re
# Bucket and key prefix whose objects are listed and copied.
source_bucket = 's3-source-bucket'
source_prefix = 'original/'
# Bucket and key prefix the copies are written to.
target_bucket = 's3-target-bucket'
target_prefix = 'target/'
# Object name appended to target_prefix for every copy (the "renamed" file).
target_file = 'target_file.csv'
def copy_all_keys_v2(source_bucket='', source_prefix='', target_bucket='', target_prefix='',
                     s3_client=None, target_name=None):
    """Copy every object under a source prefix to one fixed target key.

    Each object listed under ``source_prefix`` is copied to
    ``target_prefix + target_name`` in ``target_bucket``. Because the target
    key is the same for every object, a later copy overwrites an earlier one;
    the source objects are not deleted. The number of listed objects is
    printed at the end.

    Args:
        source_bucket: Bucket to list objects from.
        source_prefix: Key prefix used to select the source objects.
        target_bucket: Bucket that receives the copy.
        target_prefix: Key prefix prepended to the target file name.
        s3_client: Object providing ``list_objects_v2`` and ``copy_object``
            (e.g. a boto3 S3 client). Defaults to the module-level
            ``s3client``.
        target_name: File name of the copy. Defaults to the module-level
            ``target_file``.
    """
    # Fall back to the module-level names so existing callers keep working.
    client = s3client if s3_client is None else s3_client
    name = target_file if target_name is None else target_name
    target_key = target_prefix + name

    contents_count = 0
    list_kwargs = {'Bucket': source_bucket, 'Prefix': source_prefix}
    while True:
        response = client.list_objects_v2(**list_kwargs)
        # 'Contents' is absent when a page holds no objects.
        contents = response.get('Contents', [])
        contents_count += len(contents)
        for content in contents:
            print('Copying: s3://' + source_bucket + '/' + content['Key'] + ' To s3://' + target_bucket + '/' + target_key)
            client.copy_object(
                Bucket=target_bucket,
                Key=target_key,
                CopySource={'Bucket': source_bucket, 'Key': content['Key']},
            )
        # Keep paging until the listing stops handing back a continuation token.
        next_token = response.get('NextContinuationToken')
        if not next_token:
            break
        list_kwargs['ContinuationToken'] = next_token
    print(contents_count)
if __name__ == "__main__":
    # Module-level S3 client; copy_all_keys_v2 reads it as a global.
    s3client = boto3.client('s3')
    # Run the copy with the bucket/prefix settings defined at the top of the script.
    copy_all_keys_v2(
        source_bucket=source_bucket,
        source_prefix=source_prefix,
        target_bucket=target_bucket,
        target_prefix=target_prefix,
    )