0.0.1 version

- added alma_get_sources tool
- added tests for common functions
- added setup.py and uploaded the project to PyPI
Eugene Zamriy 2021-08-25 17:10:58 +03:00
parent 896ea1f1ba
commit 804cca076e
Signed by: ezamriy
GPG Key ID: 5B659D38AF0CB46F
9 changed files with 570 additions and 207 deletions

View File

@@ -3,6 +3,24 @@
 Utilities for working with the AlmaLinux OS Git server.
 
+## alma_get_sources
+
+The `alma_get_sources` script downloads sources and BLOBs from the AlmaLinux
+sources cache.
+
+### Usage
+
+Run the `alma_get_sources` in a git project root directory:
+
+1. Clone an AlmaLinux RPM package git project from
+   [git.almalinux.org](https://git.almalinux.org).
+2. Switch to a required branch.
+3. Run the `alma_get_sources` tool:
+
+   ```shell
+   $ alma_get_sources
+   ```
+
 ## alma_blob_upload
 
 The `alma_blob_upload` script uploads sources and BLOBs to the AlmaLinux
@@ -10,16 +28,6 @@ sources cache.
 
 ### Prerequirements
 
-Install the `python3-boto3` package:
-
-```shell
-# RPM-based distributions. On EL8 derivatives the package is available from EPEL.
-$ sudo dnf install python3 python3-boto3
-
-# Debian-based distributions
-$ sudo apt install python3-boto3
-```
-
 Create an AWS credentials file ~/.aws/credentials with the following content:
 
 ```ini
@@ -43,20 +51,20 @@ For CentOS repositories workflow will be the following:
 3. Run the `alma_blob_upload` tool (don't forget to replace `PROJECT_NAME` with
    an actual project name):
 
    ```shell
-   $ alma_blob_upload.py -i .PROJECT_NAME.metadata
+   $ alma_blob_upload -i .PROJECT_NAME.metadata
    ```
 
 Alternatively, you can upload a list of files in the following way:
 
 ```shell
-$ alma_blob_upload.py -f SOURCES/FILE_1 SOURCES/FILE_N
+$ alma_blob_upload -f SOURCES/FILE_1 SOURCES/FILE_N
 ```
 
 The `alma_blob_upload` utility can also generate a CentOS-compatible metadata
 file:
 
-```
+```shell
-$ alma_blob_upload.py -o .PROJECT_NAME.metadata -f SOURCES/FILE_1 SOURCES/FILE_N
+$ alma_blob_upload -o .PROJECT_NAME.metadata -f SOURCES/FILE_1 SOURCES/FILE_N
 ```
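For reference, each record in a CentOS-style `.PROJECT_NAME.metadata` file is a `checksum path` pair, and the checksum type is inferred from the digest length. A minimal sketch of that parsing logic, using a record borrowed from this commit's test data (the file name is illustrative):

```python
# One metadata record: "<checksum> <relative source path>".
line = '850747ae43a5c81f1dd0d906dfa9e149eb19748a SOURCES/mc-4.8.19.tar.xz'
checksum, file_path = line.split()
# The digest length identifies the hash type, as in detect_checksum_type() below.
hash_types = {32: 'md5', 40: 'sha1', 64: 'sha256', 128: 'sha512'}
print(hash_types[len(checksum)], file_path)  # -> sha1 SOURCES/mc-4.8.19.tar.xz
```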

View File

@@ -0,0 +1 @@
name = 'gitutils'
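This one-line module (presumably `almalinux/gitutils/__init__.py`; the page does not show the file name) lives inside the `almalinux` namespace package that setup.py later collects with `find_namespace_packages(include=['almalinux.*'])`. After installation it should be importable as sketched here:

```python
# Illustrative check of the namespace-package layout (module path assumed above).
>>> from almalinux import gitutils
>>> gitutils.name
'gitutils'
```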

View File

@@ -1,193 +1,156 @@
-#!/usr/bin/env python3
-"""
-Uploads sources and BLOBs to the AlmaLinux sources cache.
-"""
-
-import argparse
-import hashlib
-import os.path
-import sys
-from typing import Iterator, List, TextIO, Tuple
-
-import boto3  # type: ignore
-from botocore.exceptions import ClientError  # type: ignore
-
-
-def init_arg_parser() -> argparse.ArgumentParser:
-    """
-    Initializes a command line arguments parser.
-
-    Returns:
-        Command line arguments parser.
-    """
-    arg_parser = argparse.ArgumentParser(
-        prog="alma_blob_upload",
-        description="Uploads sources and BLOBs to the AlmaLinux sources cache"
-    )
-    group = arg_parser.add_mutually_exclusive_group(required=True)
-    group.add_argument('-f', '--file', nargs='+', help='file(s) to upload')
-    group.add_argument('-i', '--input-metadata', metavar='INPUT_FILE',
-                       help='input metadata file list to upload')
-    arg_parser.add_argument('-b', '--bucket', default="sources.almalinux.org",
-                            help='Amazon S3 bucket name. Default is '
-                                 'sources.almalinux.org')
-    arg_parser.add_argument('-o', '--output-metadata', metavar='OUTPUT_FILE',
-                            help='output metadata file path')
-    arg_parser.add_argument('-p', '--private', action='store_true',
-                            help='set uploaded file mode to private. All '
-                                 'uploaded files are public by default')
-    arg_parser.add_argument('--domain-name', default='sources.almalinux.org',
-                            help='AlmaLinux sources server domain name. '
-                                 'Default is sources.almalinux.org')
-    arg_parser.add_argument('-v', '--verbose', action='store_true',
-                            help='enable additional debug output')
-    return arg_parser
-
-
-def get_file_checksum(file_path: str, checksum_type: str = 'sha1',
-                      buff_size: int = 1048576) -> str:
-    """
-    Calculates a file checksum.
-
-    Args:
-        file_path: File path.
-        checksum_type: Checksum type.
-        buff_size: Number of bytes to read at once.
-
-    Returns:
-        File checksum.
-    """
-    hasher = hashlib.new(checksum_type)
-    with open(file_path, 'rb') as fd:
-        buff = fd.read(buff_size)
-        while len(buff):
-            hasher.update(buff)
-            buff = fd.read(buff_size)
-    return hasher.hexdigest()
-
-
-def normalize_path(path: str) -> str:
-    """
-    Returns an absolute path with all variables expanded.
-
-    Args:
-        path: Path to normalize.
-
-    Returns:
-        Normalized path.
-    """
-    return os.path.abspath(os.path.expanduser(os.path.expandvars(path)))
-
-
-def iter_metadata(metadata_path: str) -> Iterator[Tuple[str, str]]:
-    """
-    Iterates over records in a CentOS git repository-compatible metadata file.
-
-    Args:
-        metadata_path: Metadata file path.
-
-    Returns:
-        Iterator over files and their checksums.
-    """
-    with open(metadata_path, 'r') as fd:
-        for line in fd:
-            checksum, file_path = line.split()
-            file_path = normalize_path(file_path)
-            assert checksum == get_file_checksum(file_path)
-            yield file_path, checksum
-
-
-def iter_files(files: List[str]) -> Iterator[Tuple[str, str]]:
-    """
-    Iterates over a list of files and calculates checksums for them.
-
-    Args:
-        files: List of files.
-
-    Returns:
-        Iterator over files and their checksums.
-    """
-    for file_path in files:
-        file_path = normalize_path(file_path)
-        checksum = get_file_checksum(file_path)
-        yield file_path, checksum
-
-
-def is_file_exist(s3_client, bucket_name: str, checksum: str) -> bool:
-    """
-    Checks if a file with a given checksum is already uploaded.
-
-    Args:
-        s3_client: Amazon S3 client.
-        bucket_name: S3 bucket name.
-        checksum: File checksum.
-
-    Returns:
-        True if a file is already uploaded, False otherwise.
-    """
-    try:
-        s3_client.head_object(Bucket=bucket_name, Key=checksum)
-        return True
-    except ClientError:
-        return False
-
-
-def upload_file(s3_client, bucket_name: str, file_path: str,
-                checksum: str, private: bool):
-    """
-    Uploads a file to an Amazon S3 bucket.
-
-    Args:
-        s3_client: Amazon S3 client.
-        bucket_name: S3 bucket name.
-        file_path: File path.
-        checksum: File checksum.
-        private: False if file should be public, True otherwise.
-    """
-    acl = 'bucket-owner-full-control' if private else 'public-read'
-    s3_client.upload_file(file_path, bucket_name, checksum,
-                          ExtraArgs={'ACL': acl})
-
-
-def add_metadata_record(metadata_fd: TextIO, file_path: str, checksum: str):
-    """
-    Adds a source file record to a metadata file.
-
-    Args:
-        metadata_fd: Metadata file descriptor.
-        file_path: Source file path.
-        checksum: Source file checksum.
-    """
-    rel_path = os.path.relpath(file_path)
-    metadata_fd.write(f'{checksum} {rel_path}\n')
-
-
-def main(sys_args):
-    arg_parser = init_arg_parser()
-    args = arg_parser.parse_args(sys_args)
-    s3_client = boto3.client('s3')
-    if args.input_metadata:
-        iterator = iter_metadata(args.input_metadata)
-    else:
-        iterator = iter_files(args.file)
-    out_fd = None
-    if args.output_metadata:
-        out_fd = open(args.output_metadata, 'w')
-    try:
-        for file_path, checksum in iterator:
-            file_url = f'https://{args.domain_name}/{checksum}'
-            if not is_file_exist(s3_client, args.bucket, checksum):
-                upload_file(s3_client, args.bucket, file_path, checksum,
-                            args.private)
-                print(f'{file_path} uploaded: {file_url}')
-            else:
-                print(f'{file_path} exists: {file_url}')
-            if out_fd:
-                add_metadata_record(out_fd, file_path, checksum)
-    finally:
-        if out_fd:
-            out_fd.close()
-
-
-if __name__ == '__main__':
-    sys.exit(main(sys.argv[1:]))
+"""Uploads sources and BLOBs to the AlmaLinux sources cache"""
+
+import argparse
+import logging
+import os
+import sys
+from typing import Iterator, List, Optional, Tuple
+
+import boto3
+import botocore.exceptions
+
+from almalinux.gitutils.errors import ChecksumError
+from almalinux.gitutils.common import (
+    configure_logger, find_metadata_file, get_file_checksum, iter_metadata,
+    normalize_path
+)
+
+
+def init_arg_parser() -> argparse.ArgumentParser:
+    """
+    Initializes a command line arguments parser.
+
+    Returns:
+        Command line arguments parser.
+    """
+    arg_parser = argparse.ArgumentParser(
+        prog="alma_blob_upload",
+        description="Uploads sources and BLOBs to the AlmaLinux sources cache"
+    )
+    group = arg_parser.add_mutually_exclusive_group()
+    group.add_argument('-f', '--file', nargs='+', help='file(s) to upload')
+    group.add_argument('-i', '--input-metadata', metavar='INPUT_FILE',
+                       help='input metadata file list to upload. Will be '
+                            'detected automatically if omitted and no files '
+                            'provided')
+    arg_parser.add_argument('-b', '--bucket', default="sources.almalinux.org",
+                            help='Amazon S3 bucket name. Default is '
+                                 'sources.almalinux.org')
+    arg_parser.add_argument('-o', '--output-metadata', metavar='OUTPUT_FILE',
+                            help='output metadata file path')
+    arg_parser.add_argument('-p', '--private', action='store_true',
+                            help='set uploaded file mode to private. All '
+                                 'uploaded files are public by default')
+    arg_parser.add_argument('--domain-name', default='sources.almalinux.org',
+                            help='AlmaLinux sources server domain name. '
+                                 'Default is sources.almalinux.org')
+    arg_parser.add_argument('-v', '--verbose', action='store_true',
+                            help='enable additional debug output')
+    return arg_parser
+
+
+def iter_files(files: List[str]) -> Iterator[Tuple[str, str, str]]:
+    """
+    Iterates over a list of files and calculates checksums for them.
+
+    Args:
+        files: List of files.
+
+    Returns:
+        Iterator over files and their checksums.
+    """
+    checksum_type = 'sha1'
+    for rel_path in files:
+        file_path = normalize_path(rel_path)
+        checksum = get_file_checksum(file_path, checksum_type)
+        yield rel_path, checksum, checksum_type
+
+
+def is_file_exist(s3_client, bucket_name: str, checksum: str) -> bool:
+    """
+    Checks if a file with a given checksum is already uploaded.
+
+    Args:
+        s3_client: Amazon S3 client.
+        bucket_name: S3 bucket name.
+        checksum: File checksum.
+
+    Returns:
+        True if a file is already uploaded, False otherwise.
+    """
+    try:
+        s3_client.head_object(Bucket=bucket_name, Key=checksum)
+        return True
+    except botocore.exceptions.ClientError:
+        return False
+
+
+def upload_file(s3_client, bucket_name: str, file_path: str,
+                checksum: str, private: bool):
+    """
+    Uploads a file to an Amazon S3 bucket.
+
+    Args:
+        s3_client: Amazon S3 client.
+        bucket_name: S3 bucket name.
+        file_path: File path.
+        checksum: File checksum.
+        private: False if file should be public, True otherwise.
+    """
+    acl = 'bucket-owner-full-control' if private else 'public-read'
+    s3_client.upload_file(file_path, bucket_name, checksum,
+                          ExtraArgs={'ACL': acl})
+
+
+def get_file_iterator(
+        files: List[str], metadata_path: Optional[str]
+) -> Iterator[Tuple[str, str, str]]:
+    """
+    Finds a suitable file iterator for given arguments.
+
+    Args:
+        files: List of files.
+        metadata_path: Metadata file path.
+
+    Returns:
+        File iterator.
+    """
+    if files:
+        iterator = iter_files(files)
+    else:
+        if not metadata_path:
+            metadata_path = find_metadata_file(os.getcwd())
+        iterator = iter_metadata(metadata_path)
+    return iterator
+
+
+def main():
+    arg_parser = init_arg_parser()
+    args = arg_parser.parse_args(sys.argv[1:])
+    configure_logger(args.verbose)
+    s3_client = boto3.client('s3')
+    iterator = get_file_iterator(args.file, args.input_metadata)
+    out_fd = None
+    if args.output_metadata:
+        out_fd = open(args.output_metadata, 'w')
+    try:
+        for rel_path, checksum, checksum_type in iterator:
+            file_path = normalize_path(rel_path)
+            if not args.file:
+                real_checksum = get_file_checksum(file_path, checksum_type)
+                if real_checksum != checksum:
+                    raise ChecksumError(
+                        f"{rel_path} {checksum_type} checksum {real_checksum} "
+                        f"doesn't match expected {checksum}"
+                    )
+            file_url = f'https://{args.domain_name}/{checksum}'
+            if not is_file_exist(s3_client, args.bucket, checksum):
+                upload_file(s3_client, args.bucket, file_path, checksum,
+                            args.private)
+                logging.info(f'{rel_path} successfully uploaded: {file_url}')
+            else:
+                logging.info(f'{rel_path} is already uploaded: {file_url}')
+            if out_fd:
+                out_fd.write(f'{checksum} {rel_path}\n')
+    finally:
+        if out_fd:
+            out_fd.close()
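The upload scheme above is content-addressed: the S3 object key is the file's SHA-1 digest, so the public URL can be derived from the checksum alone and `is_file_exist` can deduplicate with a cheap `HeadObject` call. A minimal sketch of that round trip (bucket, domain, and file name are illustrative; assumes AWS credentials are configured):

```python
import boto3
import botocore.exceptions

from almalinux.gitutils.common import get_file_checksum

s3_client = boto3.client('s3')
checksum = get_file_checksum('SOURCES/example.tar.gz')  # sha1 by default
try:
    # Cheap existence check by checksum key, as in is_file_exist().
    s3_client.head_object(Bucket='sources.almalinux.org', Key=checksum)
    print(f'already cached: https://sources.almalinux.org/{checksum}')
except botocore.exceptions.ClientError:
    # The key is the checksum, not the file name, so identical blobs are stored once.
    s3_client.upload_file('SOURCES/example.tar.gz', 'sources.almalinux.org',
                          checksum, ExtraArgs={'ACL': 'public-read'})
```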

View File

@@ -0,0 +1,120 @@
"""AlmaLinux Git server utilities common functions"""

import hashlib
import logging
import os
import re
from typing import Iterator, Tuple

__all__ = ['configure_logger', 'detect_checksum_type', 'find_metadata_file',
           'get_file_checksum', 'iter_metadata', 'normalize_path']


def configure_logger(verbose: bool) -> logging.Logger:
    """
    Configures a console logger.

    Args:
        verbose: Show DEBUG messages if True, show INFO and higher otherwise.

    Returns:
        Configured logger.
    """
    level = logging.DEBUG if verbose else logging.INFO
    handler = logging.StreamHandler()
    handler.setLevel(level)
    log_format = "%(levelname)-8s: %(message)s"
    formatter = logging.Formatter(log_format, '%y-%m-%d %H:%M:%S')
    handler.setFormatter(formatter)
    logger = logging.getLogger()
    logger.addHandler(handler)
    logger.setLevel(level)
    return logger


def detect_checksum_type(checksum: str) -> str:
    """
    Detects a checksum type by its length.

    Args:
        checksum: Checksum.

    Returns:
        Checksum type.
    """
    hash_types = {32: 'md5', 40: 'sha1', 64: 'sha256', 128: 'sha512'}
    hash_type = hash_types.get(len(checksum))
    if not hash_type:
        raise ValueError(f'unknown checksum type {checksum}')
    return hash_type


def find_metadata_file(path: str) -> str:
    """
    Finds a sources metadata file in the specified directory.

    Args:
        path: Directory to search in.

    Returns:
        Sources metadata file path.
    """
    files = [f for f in os.listdir(path) if re.match(r'^\.\S*?\.metadata$', f)]
    if not files:
        raise Exception('metadata file is not found')
    elif len(files) > 1:
        raise Exception('multiple metadata files found. Please specify one to '
                        'use')
    return os.path.join(path, files[0])


def get_file_checksum(file_path: str, checksum_type: str = 'sha1',
                      buff_size: int = 1048576) -> str:
    """
    Calculates a file checksum.

    Args:
        file_path: File path.
        checksum_type: Checksum type.
        buff_size: Number of bytes to read at once.

    Returns:
        File checksum.
    """
    hasher = hashlib.new(checksum_type)
    with open(file_path, 'rb') as fd:
        buff = fd.read(buff_size)
        while len(buff):
            hasher.update(buff)
            buff = fd.read(buff_size)
    return hasher.hexdigest()


def iter_metadata(metadata_path: str) -> Iterator[Tuple[str, str, str]]:
    """
    Iterates over records in a CentOS git repository-compatible metadata file.

    Args:
        metadata_path: Metadata file path.

    Returns:
        Iterator over files, their checksums and checksum types.
    """
    with open(metadata_path, 'r') as fd:
        for line in fd:
            checksum, file_path = line.split()
            checksum_type = detect_checksum_type(checksum)
            yield file_path, checksum, checksum_type


def normalize_path(path: str) -> str:
    """
    Returns an absolute path with all variables expanded.

    Args:
        path: Path to normalize.

    Returns:
        Normalized path.
    """
    return os.path.abspath(os.path.expanduser(os.path.expandvars(path)))
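A quick interactive illustration of the helpers above, using digests taken from this commit's test suite:

```python
>>> from almalinux.gitutils.common import detect_checksum_type
>>> detect_checksum_type('35d14f5ab4ee239b070f3b645fb82837')          # 32 chars
'md5'
>>> detect_checksum_type('850747ae43a5c81f1dd0d906dfa9e149eb19748a')  # 40 chars
'sha1'
>>> detect_checksum_type('somethingwrong')
Traceback (most recent call last):
  ...
ValueError: unknown checksum type somethingwrong
```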

View File

@@ -0,0 +1,15 @@
"""AlmaLinux Git server utilities error classes"""


class ChecksumError(Exception):
    """File checksum mismatch exception"""
    pass


class NetworkError(Exception):
    """Network error exception"""
    pass

View File

@@ -0,0 +1,120 @@
"""Downloads sources and blobs from AlmaLinux or CentOS sources cache"""

import argparse
import logging
import os
import shutil
import sys

import requests

from almalinux.gitutils.common import (
    configure_logger, find_metadata_file, get_file_checksum, iter_metadata,
    normalize_path
)
from almalinux.gitutils.errors import ChecksumError, NetworkError


def init_arg_parser() -> argparse.ArgumentParser:
    """
    Initializes a command line arguments parser.

    Returns:
        Command line arguments parser.
    """
    arg_parser = argparse.ArgumentParser(prog='alma_get_sources',
                                         description=__doc__)
    arg_parser.add_argument('-i', '--input-metadata', metavar='INPUT_FILE',
                            help='input metadata file list to download')
    arg_parser.add_argument('--domain-name', default='sources.almalinux.org',
                            help='AlmaLinux sources server domain name. '
                                 'Default is sources.almalinux.org')
    arg_parser.add_argument('-v', '--verbose', action='store_true',
                            help='enable additional debug output')
    return arg_parser


def create_sources_dir(base_dir: str, rel_path: str):
    """
    Creates a sources directory if it doesn't exist.

    Args:
        base_dir: Project's base directory.
        rel_path: Project's source file relative path.
    """
    dir_name, file_name = os.path.split(rel_path)
    dir_path = os.path.join(base_dir, dir_name)
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)


def download_alma_blob(file_path: str, checksum: str, domain_name: str):
    """
    Downloads a BLOB from the AlmaLinux Git sources cache.

    Args:
        file_path: Destination file path.
        checksum: File checksum.
        domain_name: AlmaLinux Git source cache domain name.
    """
    url = f'https://{domain_name}/{checksum}'
    with requests.get(url, stream=True) as rqst:
        try:
            rqst.raise_for_status()
        except requests.exceptions.HTTPError as e:
            raise NetworkError(str(e))
        with open(file_path, 'wb') as fd:
            shutil.copyfileobj(rqst.raw, fd)


def download_metadata_blobs(metadata_path: str, base_dir: str,
                            domain_name: str):
    """
    Downloads BLOBs listed in a metadata file from AlmaLinux Git sources cache.

    Args:
        metadata_path: Metadata file path.
        base_dir: Package sources base directory.
        domain_name: AlmaLinux Git sources cache domain name.
    """
    for rel_path, checksum, checksum_type in iter_metadata(metadata_path):
        file_path = os.path.join(base_dir, rel_path)
        if os.path.exists(file_path):
            real_checksum = get_file_checksum(file_path, checksum_type)
            if real_checksum != checksum:
                raise ChecksumError(
                    f"{rel_path} already exists but its {checksum_type} "
                    f"checksum {real_checksum} doesn't match expected "
                    f"{checksum}"
                )
            logging.info(f'{rel_path} already exists and its checksum is '
                         f'correct')
            continue
        create_sources_dir(base_dir, rel_path)
        download_alma_blob(file_path, checksum, domain_name)
        real_checksum = get_file_checksum(file_path, checksum_type)
        if real_checksum != checksum:
            raise ChecksumError(
                f"{rel_path} has been downloaded but its {checksum_type} "
                f"checksum {real_checksum} doesn't match expected {checksum}"
            )
        logging.info(f'{rel_path} has been successfully downloaded')


def main():
    arg_parser = init_arg_parser()
    args = arg_parser.parse_args(sys.argv[1:])
    configure_logger(args.verbose)
    base_dir = os.getcwd()
    if args.input_metadata:
        metadata_path = normalize_path(args.input_metadata)
    else:
        metadata_path = find_metadata_file(base_dir)
    try:
        download_metadata_blobs(metadata_path, base_dir, args.domain_name)
    except ChecksumError as e:
        logging.error(e)
        return os.EX_DATAERR
    except NetworkError as e:
        logging.error(e)
        return os.EX_IOERR
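The download path mirrors the upload scheme: a blob is fetched from `https://<domain>/<checksum>` and then re-hashed on disk, exactly as `download_metadata_blobs` does above. A standalone sketch of that fetch-and-verify step (checksum and file name are illustrative):

```python
import hashlib
import shutil

import requests

checksum = '850747ae43a5c81f1dd0d906dfa9e149eb19748a'  # expected sha1 (illustrative)
with requests.get(f'https://sources.almalinux.org/{checksum}',
                  stream=True) as rqst:
    rqst.raise_for_status()
    with open('mc-4.8.19.tar.xz', 'wb') as fd:
        shutil.copyfileobj(rqst.raw, fd)  # stream the body straight to disk

# Re-hash the downloaded file and compare against the metadata record.
with open('mc-4.8.19.tar.xz', 'rb') as fd:
    assert hashlib.sha1(fd.read()).hexdigest() == checksum
```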

6 pyproject.toml Normal file
View File

@@ -0,0 +1,6 @@
[build-system]
requires = [
    "setuptools>=42",
    "wheel"
]
build-backend = "setuptools.build_meta"

36 setup.py Normal file
View File

@@ -0,0 +1,36 @@
from setuptools import find_namespace_packages, setup

with open("README.md", "r", encoding="utf-8") as fd:
    long_description = fd.read()

setup(
    name="almalinux-git-utils",
    version="0.0.1",
    author="Eugene Zamriy",
    author_email="ezamriy@almalinux.org",
    description="Utilities for working with the AlmaLinux OS Git server",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://git.almalinux.org/almalinux/almalinux-git-utils",
    project_urls={
        "Bug Tracker": "https://git.almalinux.org/almalinux/almalinux-git-utils/issues",
    },
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)",
        "Operating System :: OS Independent",
    ],
    packages=find_namespace_packages(include=['almalinux.*']),
    entry_points={
        'console_scripts': [
            'alma_blob_upload=almalinux.gitutils.blob_upload:main',
            'alma_get_sources=almalinux.gitutils.get_sources:main'
        ]
    },
    install_requires=[
        'boto3>=1.15.15',
        'requests>=2.20.0'
    ],
    python_requires=">=3.6",
    zip_safe=False
)
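The `console_scripts` entries are why both modules expose a `main()` that returns an exit status: the wrapper script that setuptools generates behaves roughly like the sketch below, so the `os.EX_*` values returned by `get_sources.main()` become the process exit code.

```python
# Rough, simplified equivalent of the generated alma_get_sources wrapper.
import sys

from almalinux.gitutils.get_sources import main

if __name__ == '__main__':
    sys.exit(main())  # a returned os.EX_* value becomes the exit status
```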

View File

@@ -0,0 +1,94 @@
import os

import pytest

from almalinux.gitutils.common import *


@pytest.mark.parametrize(
    'checksum,expected',
    [('35d14f5ab4ee239b070f3b645fb82837', 'md5'),
     ('1014c8812720619a5a6bcd189e5d7f5d16276d86', 'sha1'),
     ('86d8a9a32cdaff2c6003c67a12549466319e0ae51b7665fd01fd9354a3b1cf55',
      'sha256'),
     ('9906e61ef0b693bf978e2a88b737c79dd2c815cfc1a09443f04b79b994b4646ff72f18'
      '6e42461b3a5768667119f39fa006ce71530791a5b35c2278e9252ec3ea', 'sha512')]
)
def test_detect_checksum_type(checksum, expected):
    """detect_checksum_type returns type for supported checksums"""
    assert detect_checksum_type(checksum) == expected


def test_detect_checksum_type_error():
    """detect_checksum_type raises ValueError if checksum type is unknown"""
    with pytest.raises(ValueError):
        detect_checksum_type('somethingwrong')


@pytest.mark.parametrize(
    'checksum_type,checksum',
    [(None, '06364afe79d801433188262478a76d19777ef351'),
     ('sha1', '06364afe79d801433188262478a76d19777ef351'),
     ('sha256',
      'b37758528c0338d529b3fb16fd39f28da58241abc856e16bf0bc8b99c60cd632')]
)
def test_get_file_checksum(tmpdir, checksum_type, checksum):
    """get_file_checksum supports different checksum types"""
    file_path = os.path.join(tmpdir, 'test_file.txt')
    with open(file_path, 'w') as fd:
        fd.write('TESTDATA\n')
    args = [file_path]
    if checksum_type:
        args.append(checksum_type)
    assert get_file_checksum(*args) == checksum


def test_find_metadata_file_single(tmpdir):
    """find_metadata_file returns a single metadata file"""
    file_path = os.path.join(tmpdir, '.project.metadata')
    open(file_path, 'a').close()
    assert find_metadata_file(tmpdir) == file_path


def test_find_metadata_file_missing(tmpdir):
    """find_metadata_file raises Exception when metadata file is not found"""
    with pytest.raises(Exception):
        find_metadata_file(tmpdir)


def test_find_metadata_file_multiple(tmpdir):
    """
    find_metadata_file raises Exception when there are multiple metadata files
    """
    for i in range(2):
        open(os.path.join(tmpdir, f'.project{i}.metadata'), 'a').close()
    with pytest.raises(Exception):
        find_metadata_file(tmpdir)


def test_iter_metadata(tmpdir):
    """iter_metadata returns checksums from metadata file"""
    data = [
        ('SOURCES/mc-4.8.19.tar.xz',
         '850747ae43a5c81f1dd0d906dfa9e149eb19748a', 'sha1'),
        ('SOURCES/binary-blob',
         'b37758528c0338d529b3fb16fd39f28da58241abc856e16bf0bc8b99c60cd632',
         'sha256')
    ]
    metadata_path = os.path.join(tmpdir, '.project.metadata')
    with open(metadata_path, 'w') as fd:
        for rec in data:
            fd.write(f'{rec[1]} {rec[0]}\n')
    metadata = []
    for file_path, checksum, checksum_type in iter_metadata(metadata_path):
        metadata.append((file_path, checksum, checksum_type))
    assert metadata == data


def test_normalize_path(monkeypatch):
    """
    normalize_path expands variables and converts relative paths to absolute
    """
    cwd = os.getcwd()
    expected = os.path.join(cwd, 'basedir', 'subdir')
    monkeypatch.setenv('BASE_DIR', 'basedir')
    assert normalize_path('${BASE_DIR}/subdir') == expected
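The suite relies only on stock pytest fixtures (`tmpdir`, `monkeypatch`), so it can be run with a plain `python3 -m pytest`; an equivalent programmatic sketch:

```python
# Run the test suite programmatically (assumes pytest is installed).
import pytest

raise SystemExit(pytest.main(['-v']))
```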