almalinux-git-utils/alma_blob_upload.py

194 lines
5.9 KiB
Python

#!/usr/bin/env python3
"""
Uploads sources and BLOBs to the AlmaLinux sources cache.
"""
import argparse
import hashlib
import os.path
import sys
from typing import Iterator, List, TextIO, Tuple
import boto3 # type: ignore
from botocore.exceptions import ClientError # type: ignore
def init_arg_parser() -> argparse.ArgumentParser:
    """
    Initializes a command line arguments parser.

    Returns:
        Command line arguments parser.
    """
    parser = argparse.ArgumentParser(
        prog="alma_blob_upload",
        description="Uploads sources and BLOBs to the AlmaLinux sources cache"
    )
    # exactly one input source must be given: explicit files or a metadata list
    src_group = parser.add_mutually_exclusive_group(required=True)
    src_group.add_argument('-f', '--file', nargs='+',
                           help='file(s) to upload')
    src_group.add_argument('-i', '--input-metadata', metavar='INPUT_FILE',
                           help='input metadata file list to upload')
    parser.add_argument('-b', '--bucket', default="sources.almalinux.org",
                        help='Amazon S3 bucket name. Default is '
                             'sources.almalinux.org')
    parser.add_argument('-o', '--output-metadata', metavar='OUTPUT_FILE',
                        help='output metadata file path')
    parser.add_argument('-p', '--private', action='store_true',
                        help='set uploaded file mode to private. All '
                             'uploaded files are public by default')
    parser.add_argument('--domain-name', default='sources.almalinux.org',
                        help='AlmaLinux sources server domain name. '
                             'Default is sources.almalinux.org')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='enable additional debug output')
    return parser
def get_file_checksum(file_path: str, checksum_type: str = 'sha1',
                      buff_size: int = 1048576) -> str:
    """
    Calculates a file checksum.

    Args:
        file_path: File path.
        checksum_type: Hash algorithm name accepted by hashlib.new
            (e.g. 'sha1', 'sha256').
        buff_size: Number of bytes to read at once.

    Returns:
        Hexadecimal checksum string.
    """
    hasher = hashlib.new(checksum_type)
    with open(file_path, 'rb') as fd:
        # read in fixed-size chunks so large files are never fully loaded
        # into memory; iter() stops at the b'' sentinel returned at EOF
        for buff in iter(lambda: fd.read(buff_size), b''):
            hasher.update(buff)
    return hasher.hexdigest()
def normalize_path(path: str) -> str:
    """
    Returns an absolute path with all variables expanded.

    Args:
        path: Path to normalize.

    Returns:
        Normalized path.
    """
    # expand $VARS first, then ~user, then resolve to an absolute path
    expanded = os.path.expandvars(path)
    expanded = os.path.expanduser(expanded)
    return os.path.abspath(expanded)
def iter_metadata(metadata_path: str) -> Iterator[Tuple[str, str]]:
    """
    Iterates over records in a CentOS git repository-compatible metadata file.

    Each non-empty line has the form "{checksum} {file_path}". Every listed
    file's checksum is recalculated and verified against the recorded value.

    Args:
        metadata_path: Metadata file path.

    Returns:
        Iterator over files and their checksums.

    Raises:
        ValueError: If a file's actual checksum does not match the checksum
            recorded in the metadata file.
    """
    with open(metadata_path, 'r') as fd:
        for line in fd:
            line = line.strip()
            # tolerate blank lines in hand-edited metadata files
            if not line:
                continue
            # maxsplit=1 keeps file paths that contain whitespace intact
            checksum, file_path = line.split(maxsplit=1)
            file_path = normalize_path(file_path)
            # raise an explicit error instead of using `assert`, which is
            # silently stripped when Python runs with the -O flag
            real_checksum = get_file_checksum(file_path)
            if real_checksum != checksum:
                raise ValueError(
                    f'{file_path} checksum {real_checksum} does not match '
                    f'the recorded checksum {checksum}'
                )
            yield file_path, checksum
def iter_files(files: List[str]) -> Iterator[Tuple[str, str]]:
    """
    Iterates over a list of files and calculates checksums for them.

    Args:
        files: List of files.

    Returns:
        Iterator over files and their checksums.
    """
    for raw_path in files:
        full_path = normalize_path(raw_path)
        yield full_path, get_file_checksum(full_path)
def is_file_exist(s3_client, bucket_name: str, checksum: str) -> bool:
    """
    Checks if a file with a given checksum is already uploaded.

    Args:
        s3_client: Amazon S3 client.
        bucket_name: S3 bucket name.
        checksum: File checksum (used as the S3 object key).

    Returns:
        True if a file is already uploaded, False otherwise.
    """
    # HEAD the object keyed by checksum; a ClientError (e.g. 404) means
    # the object is not present in the bucket
    try:
        s3_client.head_object(Bucket=bucket_name, Key=checksum)
    except ClientError:
        return False
    return True
def upload_file(s3_client, bucket_name: str, file_path: str,
                checksum: str, private: bool):
    """
    Uploads a file to an Amazon S3 bucket.

    The file's checksum is used as the S3 object key.

    Args:
        s3_client: Amazon S3 client.
        bucket_name: S3 bucket name.
        file_path: File path.
        checksum: File checksum.
        private: False if file should be public, True otherwise.
    """
    if private:
        acl = 'bucket-owner-full-control'
    else:
        acl = 'public-read'
    extra_args = {'ACL': acl}
    s3_client.upload_file(file_path, bucket_name, checksum,
                          ExtraArgs=extra_args)
def add_metadata_record(metadata_fd: TextIO, file_path: str, checksum: str):
    """
    Adds a source file record to a metadata file.

    Args:
        metadata_fd: Metadata file descriptor.
        file_path: Source file path.
        checksum: Source file checksum.
    """
    # store the path relative to the current directory so the metadata
    # file stays valid when the project tree is moved
    record = f'{checksum} {os.path.relpath(file_path)}\n'
    metadata_fd.write(record)
def main(sys_args):
    """
    Command line entry point: uploads the requested files to the S3 bucket
    and optionally records them in an output metadata file.

    Args:
        sys_args: Command line arguments (excluding the program name).
    """
    args = init_arg_parser().parse_args(sys_args)
    s3_client = boto3.client('s3')
    # choose the source iterator: a metadata list file or explicit files
    if args.input_metadata:
        sources = iter_metadata(args.input_metadata)
    else:
        sources = iter_files(args.file)
    out_fd = open(args.output_metadata, 'w') if args.output_metadata else None
    try:
        for file_path, checksum in sources:
            file_url = f'https://{args.domain_name}/{checksum}'
            # skip the upload if an object with this checksum already exists
            if is_file_exist(s3_client, args.bucket, checksum):
                print(f'{file_path} exists: {file_url}')
            else:
                upload_file(s3_client, args.bucket, file_path, checksum,
                            args.private)
                print(f'{file_path} uploaded: {file_url}')
            if out_fd:
                add_metadata_record(out_fd, file_path, checksum)
    finally:
        # close the metadata file even if an upload fails midway
        if out_fd:
            out_fd.close()
# Script entry point: forward the CLI arguments (minus the program name)
# to main() and use its return value as the process exit status.
if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))