diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e9ad2d8
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+.idea
+.mypy_cache
diff --git a/README.md b/README.md
index 7400d93..2197cdd 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,65 @@
 # almalinux-git-utils
-Utilities for working with the AlmaLinux OS Git server.
\ No newline at end of file
+Utilities for working with the AlmaLinux OS Git server.
+
+
+## alma_blob_upload
+
+The `alma_blob_upload` script uploads sources and BLOBs to the AlmaLinux
+sources cache.
+
+### Prerequisites
+
+Install the `python3-boto3` package:
+
+```shell
+# RPM-based distributions. On EL8 derivatives the package is available from EPEL.
+$ sudo dnf install python3 python3-boto3
+
+# Debian-based distributions
+$ sudo apt install python3-boto3
+```
+
+Create an AWS credentials file `~/.aws/credentials` with the following content:
+
+```ini
+[default]
+aws_access_key_id = YOUR_ACCESS_KEY
+aws_secret_access_key = YOUR_SECRET_KEY
+```
+
+
+### Usage
+
+The utility supports two types of input: a CentOS git repository metadata file
+or a list of files to upload.
+
+For CentOS repositories, the workflow is the following:
+
+1. Install the `get_sources.sh` script from the
+   [centos-git-common](https://git.centos.org/centos-git-common) repository.
+2. Clone a project and download its sources as described on the CentOS
+   [Wiki](https://wiki.centos.org/Sources).
+3. Run the `alma_blob_upload` tool (don't forget to replace `PROJECT_NAME` with
+   an actual project name):
+   ```shell
+   $ alma_blob_upload.py -i .PROJECT_NAME.metadata
+   ```
+
+Alternatively, you can upload a list of files in the following way:
+
+```shell
+$ alma_blob_upload.py -f SOURCES/FILE_1 SOURCES/FILE_N
+```
+
+The `alma_blob_upload` utility can also generate a CentOS-compatible metadata
+file:
+
+```shell
+$ alma_blob_upload.py -o .PROJECT_NAME.metadata -f SOURCES/FILE_1 SOURCES/FILE_N
+```
+
+
+## License
+
+Licensed under the GPLv3 license, see the LICENSE file for details.
diff --git a/alma_blob_upload.py b/alma_blob_upload.py
new file mode 100644
index 0000000..cd6d42b
--- /dev/null
+++ b/alma_blob_upload.py
@@ -0,0 +1,193 @@
+#!/usr/bin/env python3
+
+"""
+Uploads sources and BLOBs to the AlmaLinux sources cache.
+"""
+
+import argparse
+import hashlib
+import os.path
+import sys
+from typing import Iterator, List, TextIO, Tuple
+
+import boto3  # type: ignore
+from botocore.exceptions import ClientError  # type: ignore
+
+
+def init_arg_parser() -> argparse.ArgumentParser:
+    """
+    Initializes a command line arguments parser.
+
+    Returns:
+        Command line arguments parser.
+    """
+    arg_parser = argparse.ArgumentParser(
+        prog="alma_blob_upload",
+        description="Uploads sources and BLOBs to the AlmaLinux sources cache"
+    )
+    group = arg_parser.add_mutually_exclusive_group(required=True)
+    group.add_argument('-f', '--file', nargs='+', help='file(s) to upload')
+    group.add_argument('-i', '--input-metadata', metavar='INPUT_FILE',
+                       help='input metadata file list to upload')
+    arg_parser.add_argument('-b', '--bucket', default="sources.almalinux.org",
+                            help='Amazon S3 bucket name. Default is '
+                                 'sources.almalinux.org')
+    arg_parser.add_argument('-o', '--output-metadata', metavar='OUTPUT_FILE',
+                            help='output metadata file path')
+    arg_parser.add_argument('-p', '--private', action='store_true',
+                            help='set uploaded file mode to private. All '
+                                 'uploaded files are public by default')
+    arg_parser.add_argument('--domain-name', default='sources.almalinux.org',
+                            help='AlmaLinux sources server domain name. '
+                                 'Default is sources.almalinux.org')
+    arg_parser.add_argument('-v', '--verbose', action='store_true',
+                            help='enable additional debug output')
+    return arg_parser
+
+
+def get_file_checksum(file_path: str, checksum_type: str = 'sha1',
+                      buff_size: int = 1048576) -> str:
+    """
+    Calculates a file checksum.
+
+    Args:
+        file_path: File path.
+        checksum_type: Checksum type.
+        buff_size: Number of bytes to read at once.
+
+    Returns:
+        File checksum.
+    """
+    hasher = hashlib.new(checksum_type)
+    with open(file_path, 'rb') as fd:
+        buff = fd.read(buff_size)
+        while len(buff):
+            hasher.update(buff)
+            buff = fd.read(buff_size)
+    return hasher.hexdigest()
+
+
+def normalize_path(path: str) -> str:
+    """
+    Returns an absolute path with all variables expanded.
+
+    Args:
+        path: Path to normalize.
+
+    Returns:
+        Normalized path.
+    """
+    return os.path.abspath(os.path.expanduser(os.path.expandvars(path)))
+
+
+def iter_metadata(metadata_path: str) -> Iterator[Tuple[str, str]]:
+    """
+    Iterates over records in a CentOS git repository-compatible metadata file.
+
+    Args:
+        metadata_path: Metadata file path.
+
+    Returns:
+        Iterator over files and their checksums.
+    """
+    with open(metadata_path, 'r') as fd:
+        for line in fd:
+            checksum, file_path = line.split()
+            file_path = normalize_path(file_path)
+            assert checksum == get_file_checksum(file_path)
+            yield file_path, checksum
+
+
+def iter_files(files: List[str]) -> Iterator[Tuple[str, str]]:
+    """
+    Iterates over a list of files and calculates checksums for them.
+
+    Args:
+        files: List of files.
+
+    Returns:
+        Iterator over files and their checksums.
+    """
+    for file_path in files:
+        file_path = normalize_path(file_path)
+        checksum = get_file_checksum(file_path)
+        yield file_path, checksum
+
+
+def is_file_exist(s3_client, bucket_name: str, checksum: str) -> bool:
+    """
+    Checks if a file with a given checksum is already uploaded.
+
+    Args:
+        s3_client: Amazon S3 client.
+        bucket_name: S3 bucket name.
+        checksum: File checksum.
+
+    Returns:
+        True if a file is already uploaded, False otherwise.
+    """
+    try:
+        s3_client.head_object(Bucket=bucket_name, Key=checksum)
+        return True
+    except ClientError:
+        return False
+
+
+def upload_file(s3_client, bucket_name: str, file_path: str,
+                checksum: str, private: bool):
+    """
+    Uploads a file to an Amazon S3 bucket.
+
+    Args:
+        s3_client: Amazon S3 client.
+        bucket_name: S3 bucket name.
+        file_path: File path.
+        checksum: File checksum.
+        private: False if file should be public, True otherwise.
+    """
+    acl = 'bucket-owner-full-control' if private else 'public-read'
+    s3_client.upload_file(file_path, bucket_name, checksum,
+                          ExtraArgs={'ACL': acl})
+
+
+def add_metadata_record(metadata_fd: TextIO, file_path: str, checksum: str):
+    """
+    Adds a source file record to a metadata file.
+
+    Args:
+        metadata_fd: Metadata file descriptor.
+        file_path: Source file path.
+        checksum: Source file checksum.
+    """
+    rel_path = os.path.relpath(file_path)
+    metadata_fd.write(f'{checksum} {rel_path}\n')
+
+
+def main(sys_args):
+    arg_parser = init_arg_parser()
+    args = arg_parser.parse_args(sys_args)
+    s3_client = boto3.client('s3')
+    if args.input_metadata:
+        iterator = iter_metadata(args.input_metadata)
+    else:
+        iterator = iter_files(args.file)
+    out_fd = None
+    if args.output_metadata:
+        out_fd = open(args.output_metadata, 'w')
+    try:
+        for file_path, checksum in iterator:
+            file_url = f'https://{args.domain_name}/{checksum}'
+            if not is_file_exist(s3_client, args.bucket, checksum):
+                upload_file(s3_client, args.bucket, file_path, checksum,
+                            args.private)
+                print(f'{file_path} uploaded: {file_url}')
+            else:
+                print(f'{file_path} exists: {file_url}')
+            if out_fd:
+                add_metadata_record(out_fd, file_path, checksum)
+    finally:
+        if out_fd:
+            out_fd.close()
+
+
+if __name__ == '__main__':
+    sys.exit(main(sys.argv[1:]))
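
A note on the metadata format that ties the README workflow to the script: `iter_metadata()` reads and `add_metadata_record()` writes one record per line, a SHA-1 checksum followed by a repository-relative file path, matching the CentOS sources convention. Below is a minimal sketch of the round trip, where `PROJECT_NAME-1.0.tar.gz` is a hypothetical tarball and `CHECKSUM` stands in for its real SHA-1 digest:

```shell
# Upload a source file and record it in a metadata file
$ alma_blob_upload.py -o .PROJECT_NAME.metadata -f SOURCES/PROJECT_NAME-1.0.tar.gz

# The metadata file maps the file's SHA-1 checksum to its relative path
$ cat .PROJECT_NAME.metadata
CHECKSUM SOURCES/PROJECT_NAME-1.0.tar.gz

# Blobs are stored under their checksum on the sources server, so a file can be
# fetched back from the URL the script prints after uploading
$ curl -o SOURCES/PROJECT_NAME-1.0.tar.gz https://sources.almalinux.org/CHECKSUM
```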