#!/usr/bin/env python3
"""
Uploads sources and BLOBs to the AlmaLinux sources cache.
"""

import argparse
import hashlib
import os.path
import sys
from typing import Iterator, List, TextIO, Tuple

import boto3  # type: ignore
from botocore.exceptions import ClientError  # type: ignore


def init_arg_parser() -> argparse.ArgumentParser:
    """
    Initializes a command line arguments parser.

    Returns:
        Command line arguments parser.
    """
    arg_parser = argparse.ArgumentParser(
        prog="alma_blob_upload",
        description="Uploads sources and BLOBs to the AlmaLinux sources cache"
    )
    group = arg_parser.add_mutually_exclusive_group(required=True)
    group.add_argument('-f', '--file', nargs='+',
                       help='file(s) to upload')
    group.add_argument('-i', '--input-metadata', metavar='INPUT_FILE',
                       help='input metadata file list to upload')
    arg_parser.add_argument('-b', '--bucket', default="sources.almalinux.org",
                            help='Amazon S3 bucket name. Default is '
                                 'sources.almalinux.org')
    arg_parser.add_argument('-o', '--output-metadata', metavar='OUTPUT_FILE',
                            help='output metadata file path')
    arg_parser.add_argument('-p', '--private', action='store_true',
                            help='set uploaded file mode to private. All '
                                 'uploaded files are public by default')
    arg_parser.add_argument('--domain-name', default='sources.almalinux.org',
                            help='AlmaLinux sources server domain name. '
                                 'Default is sources.almalinux.org')
    arg_parser.add_argument('-v', '--verbose', action='store_true',
                            help='enable additional debug output')
    return arg_parser


def get_file_checksum(file_path: str, checksum_type: str = 'sha1',
                      buff_size: int = 1048576) -> str:
    """
    Calculates a file checksum.

    Args:
        file_path: File path.
        checksum_type: Checksum type.
        buff_size: Number of bytes to read at once.

    Returns:
        File checksum.
    """
    hasher = hashlib.new(checksum_type)
    with open(file_path, 'rb') as fd:
        # read the file in chunks so that large files don't exhaust memory
        buff = fd.read(buff_size)
        while buff:
            hasher.update(buff)
            buff = fd.read(buff_size)
    return hasher.hexdigest()


def normalize_path(path: str) -> str:
    """
    Returns an absolute path with all variables expanded.

    Args:
        path: Path to normalize.

    Returns:
        Normalized path.
    """
    return os.path.abspath(os.path.expanduser(os.path.expandvars(path)))


def iter_metadata(metadata_path: str) -> Iterator[Tuple[str, str]]:
    """
    Iterates over records in a CentOS git repository-compatible metadata file.

    Args:
        metadata_path: Metadata file path.

    Returns:
        Iterator over files and their checksums.
    """
    with open(metadata_path, 'r') as fd:
        for line in fd:
            checksum, file_path = line.split()
            file_path = normalize_path(file_path)
            # verify that the file on disk still matches the recorded
            # checksum before uploading it under that name
            if checksum != get_file_checksum(file_path):
                raise ValueError(f'checksum mismatch for {file_path}')
            yield file_path, checksum


def iter_files(files: List[str]) -> Iterator[Tuple[str, str]]:
    """
    Iterates over a list of files and calculates checksums for them.

    Args:
        files: List of files.

    Returns:
        Iterator over files and their checksums.
    """
    for file_path in files:
        file_path = normalize_path(file_path)
        checksum = get_file_checksum(file_path)
        yield file_path, checksum


def is_file_exist(s3_client, bucket_name: str, checksum: str) -> bool:
    """
    Checks if a file with a given checksum is already uploaded.

    Args:
        s3_client: Amazon S3 client.
        bucket_name: S3 bucket name.
        checksum: File checksum.

    Returns:
        True if a file is already uploaded, False otherwise.
    """
    try:
        s3_client.head_object(Bucket=bucket_name, Key=checksum)
        return True
    except ClientError as e:
        # a 404 just means the object is missing; any other error
        # (e.g. access denied) is a real failure and should propagate
        if e.response['Error']['Code'] == '404':
            return False
        raise


def upload_file(s3_client, bucket_name: str, file_path: str,
                checksum: str, private: bool):
    """
    Uploads a file to an Amazon S3 bucket.

    Args:
        s3_client: Amazon S3 client.
        bucket_name: S3 bucket name.
        file_path: File path.
        checksum: File checksum.
        private: False if the file should be public, True otherwise.
    """
    acl = 'bucket-owner-full-control' if private else 'public-read'
    # the file's checksum is used as its S3 object key
    s3_client.upload_file(file_path, bucket_name, checksum,
                          ExtraArgs={'ACL': acl})


def add_metadata_record(metadata_fd: TextIO, file_path: str, checksum: str):
    """
    Adds a source file record to a metadata file.

    Args:
        metadata_fd: Metadata file descriptor.
        file_path: Source file path.
        checksum: Source file checksum.
    """
    rel_path = os.path.relpath(file_path)
    metadata_fd.write(f'{checksum} {rel_path}\n')


def main(sys_args):
    arg_parser = init_arg_parser()
    args = arg_parser.parse_args(sys_args)
    s3_client = boto3.client('s3')
    if args.input_metadata:
        iterator = iter_metadata(args.input_metadata)
    else:
        iterator = iter_files(args.file)
    out_fd = None
    if args.output_metadata:
        out_fd = open(args.output_metadata, 'w')
    try:
        for file_path, checksum in iterator:
            file_url = f'https://{args.domain_name}/{checksum}'
            if not is_file_exist(s3_client, args.bucket, checksum):
                upload_file(s3_client, args.bucket, file_path, checksum,
                            args.private)
                print(f'{file_path} uploaded: {file_url}')
            else:
                print(f'{file_path} exists: {file_url}')
            if out_fd:
                add_metadata_record(out_fd, file_path, checksum)
    finally:
        if out_fd:
            out_fd.close()


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))
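
# Example usage sketch (file and script names are hypothetical; assumes AWS
# credentials for boto3 are configured through the usual mechanisms, e.g.
# ~/.aws/credentials or the AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY
# environment variables):
#
#   # upload a tarball and record it in a metadata file
#   ./alma_blob_upload.py -f package-1.0.tar.gz -o .package.metadata
#
#   # later, re-upload everything listed in that metadata file
#   ./alma_blob_upload.py -i .package.metadata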