From 913a998265fa6c67efbcb57f42c6d781a2a1eb85 Mon Sep 17 00:00:00 2001 From: Kirill Zhukov Date: Mon, 27 Feb 2023 20:51:53 +0100 Subject: [PATCH] init commit --- .gitignore | 7 ++ build_analitycs/__init__.py | 0 build_analitycs/build_analytics/__init__.py | 0 build_analitycs/build_analytics/api_client.py | 82 +++++++++++++++++ build_analitycs/build_analytics/db.py | 66 ++++++++++++++ .../build_analytics/extractor/__jnit__.py | 0 .../build_analytics/extractor/extractor.py | 45 +++++++++ .../build_analytics/extractor/start.py | 71 +++++++++++++++ .../build_analytics/models/__init__.py | 0 .../build_analytics/models/build.py | 27 ++++++ .../build_analytics/models/build_db.py | 12 +++ .../build_analytics/models/build_task.py | 31 +++++++ .../build_analytics/models/build_task_db.py | 14 +++ .../build_analytics/models/db_config.py | 9 ++ .../build_analytics/models/enums.py | 17 ++++ .../models/extractor_config.py | 24 +++++ .../build_analytics/models/sign_task_db.py | 12 +++ build_analitycs/db_schema/postgres.sql | 91 +++++++++++++++++++ build_analitycs/run_extractor.py | 12 +++ 19 files changed, 520 insertions(+) create mode 100644 .gitignore create mode 100644 build_analitycs/__init__.py create mode 100644 build_analitycs/build_analytics/__init__.py create mode 100644 build_analitycs/build_analytics/api_client.py create mode 100644 build_analitycs/build_analytics/db.py create mode 100644 build_analitycs/build_analytics/extractor/__jnit__.py create mode 100644 build_analitycs/build_analytics/extractor/extractor.py create mode 100644 build_analitycs/build_analytics/extractor/start.py create mode 100644 build_analitycs/build_analytics/models/__init__.py create mode 100644 build_analitycs/build_analytics/models/build.py create mode 100644 build_analitycs/build_analytics/models/build_db.py create mode 100644 build_analitycs/build_analytics/models/build_task.py create mode 100644 build_analitycs/build_analytics/models/build_task_db.py create mode 100644 
TZ_OFFSET = '+00:00'

# Seconds before an API call is aborted; without it a stuck server
# would hang the extractor run indefinitely.
API_TIMEOUT = 30


class APIclient():
    """
    Client for the ALBS (AlmaLinux Build System) REST API.

    Fetches build listings page by page and converts the raw JSON into
    Build / BuildTask models.
    """

    def __init__(self, api_root: str, jwt: str):
        # api_root: base URL of the ALBS instance
        # jwt: auth token (kept for future authenticated endpoints)
        self.api_root = api_root
        self.jwt = jwt

    def get_builds(self, page_num: int = 1) -> List[Build]:
        """
        Fetch one page of builds from /api/v1/builds.

        Builds that fail to parse are logged and skipped rather than
        aborting the whole page.

        Raises requests.HTTPError on a non-2xx response.
        """
        ep = '/api/v1/builds'
        url = urljoin(self.api_root, ep)
        params = {'pageNumber': page_num}
        # BUGFIX: header value was misspelled 'appilication/json'
        headers = {'accept': 'application/json'}

        response = requests.get(url, params=params, headers=headers,
                                timeout=API_TIMEOUT)
        response.raise_for_status()

        result = []
        for b in response.json()['builds']:
            try:
                result.append(self._parse_build(b))
            except Exception as err:  # pylint: disable=broad-except
                logging.error("Cant convert build JSON %s to Build model: %s",
                              b, err, exc_info=True)
        return result

    def _parse_build_tasks(self, tasks_json: Dict, build_id: int) -> List[BuildTask]:
        """
        Convert raw build-task JSON into BuildTask models.

        Tasks that fail to parse are logged and skipped.  The result is
        sorted by task id, newest first.
        """
        result = []
        for task in tasks_json:
            try:
                # API timestamps are naive ISO strings in UTC; append the
                # offset so fromisoformat yields aware datetimes.
                started_at = datetime.fromisoformat(
                    task['started_at'] + TZ_OFFSET) \
                    if task['started_at'] else None
                finished_at = datetime.fromisoformat(
                    task['finished_at'] + TZ_OFFSET) \
                    if task['finished_at'] else None
                result.append(BuildTask(id=task['id'],
                                        build_id=build_id,
                                        started_at=started_at,
                                        finished_at=finished_at,
                                        arch=task['arch'],
                                        status_id=task['status']))
            except Exception as err:  # pylint: disable=broad-except
                logging.error("Cant convert build_task JSON %s (build_id %s) to BuildTask model: %s",
                              task, build_id, err, exc_info=True)

        result.sort(key=lambda x: x.id, reverse=True)
        return result

    def _parse_build(self, build_json: Dict) -> Build:
        """Convert one raw build JSON object into a Build model."""
        url = f"https://build.almalinux.org/build/{build_json['id']}"
        created_at = datetime.fromisoformat(build_json['created_at'] + TZ_OFFSET)
        finished_at = datetime.fromisoformat(build_json['finished_at'] + TZ_OFFSET) \
            if build_json['finished_at'] else None
        build_tasks = self._parse_build_tasks(
            build_json['tasks'], build_json['id'])

        return Build(id=build_json['id'],
                     url=url,
                     created_at=created_at,
                     finished_at=finished_at,
                     build_tasks=build_tasks)
class DB():
    """
    Thin psycopg2 wrapper for the build-analytics database.

    Each public method opens a fresh connection, runs one statement in a
    transaction, and closes the connection.
    """

    def __init__(self, config: DbConfig):
        self.conf = config

    def _get_conn(self):
        """Open a new connection; the caller is responsible for closing it."""
        return psycopg2.connect(database=self.conf.name,
                                host=self.conf.host,
                                user=self.conf.username,
                                password=self.conf.password,
                                port=self.conf.port)

    def insert_update_build(self, build: BuildDB):
        """Insert a build row, updating it in place if the id already exists."""
        sql = '''
            INSERT INTO builds(id, url, created_at, finished_at)
            VALUES (%s, %s, %s, %s)
            ON CONFLICT (id) DO UPDATE SET
            (url, created_at, finished_at) = (EXCLUDED.url, EXCLUDED.created_at, EXCLUDED.finished_at);
        '''
        conn = self._get_conn()
        try:
            # BUGFIX: psycopg2 `with conn:` commits/rolls back the
            # transaction but does NOT close the connection — the original
            # leaked one connection per call.
            with conn:
                with conn.cursor() as cur:
                    cur.execute(sql, (build.id, build.url,
                                      build.created_at, build.finished_at))
        finally:
            conn.close()

    def insert_update_buildtask(self, build_task: BuildTaskDB):
        """Insert a build_task row, updating it in place on id conflict."""
        sql = """
            INSERT INTO build_tasks(id, build_id, arch_id, started_at, finished_at, status_id)
            VALUES (%s, %s, %s, %s, %s, %s)
            ON CONFLICT (id) DO UPDATE SET
            (id, build_id, arch_id, started_at, finished_at, status_id) = (EXCLUDED.ID, EXCLUDED.build_id, EXCLUDED.arch_id, EXCLUDED.started_at, EXCLUDED.finished_at, EXCLUDED.status_id)
        """
        conn = self._get_conn()
        try:
            with conn:
                with conn.cursor() as cur:
                    cur.execute(sql, (build_task.id, build_task.build_id,
                                      build_task.arch_id, build_task.started_at,
                                      build_task.finished_at, build_task.status_id))
        finally:
            conn.close()

    def get_latest_build_id(self) -> Union[int, None]:
        """Return the highest stored build id, or None if the table is empty."""
        sql = "SELECT id from builds ORDER BY id DESC LIMIT 1;"
        conn = self._get_conn()
        try:
            with conn:
                with conn.cursor() as cur:
                    cur.execute(sql)
                    val = cur.fetchone()
        finally:
            conn.close()
        if not val:
            return None
        return int(val[0])

    def cleanup_builds(self, oldest_to_keep: datetime) -> int:
        """
        Delete builds created before oldest_to_keep.

        Returns the number of deleted rows; build_tasks rows go with them
        via ON DELETE CASCADE.
        """
        # created_at is stored as an epoch timestamp, so compare numerically
        params = (int(oldest_to_keep.timestamp()),)
        sql = "DELETE FROM builds WHERE created_at < %s;"
        conn = self._get_conn()
        try:
            with conn:
                with conn.cursor() as cur:
                    cur.execute(sql, params)
                    removed = cur.rowcount
        finally:
            conn.close()
        return removed
class Extractor:
    """
    Pulls builds from the ALBS API and stores them in the database.
    """

    def __init__(self, config: "ExtractorConfig", api: "APIclient", db: "DB"):
        self.oldest_build_age = config.oldest_build_age
        self.api = api
        self.db = db

    def extract_and_store(self) -> int:
        """
        Extract builds page by page (newest first) and insert them,
        with their build tasks, into the database.

        Stops at the first build that is already stored or older than
        oldest_build_age, or when the API returns an empty page.

        Returns the number of builds inserted.
        """
        build_count = 0
        page_num = 1
        last_build_id = self.db.get_latest_build_id() or 0
        logging.info("last_build_id: %s", last_build_id)

        stop = False
        while not stop:
            logging.info("page: %s", page_num)
            builds = self.api.get_builds(page_num)
            if not builds:
                # BUGFIX: an empty page means the listing is exhausted;
                # the original kept looping forever incrementing page_num.
                break
            for build in builds:
                # check if we should stop processing builds
                if build.id <= last_build_id or \
                        build.created_at <= self.oldest_build_age:
                    stop = True
                    break

                # inserting build and build tasks
                logging.info("inserting %s", build.id)
                self.db.insert_update_build(build.as_db_model())
                for build_task in build.build_tasks:
                    self.db.insert_update_buildtask(build_task.as_db_model())
                build_count += 1
            page_num += 1
        return build_count

    def build_cleanup(self):
        """Delete builds older than the configured retention threshold."""
        logging.info('Removing all builds older than %s',
                     self.oldest_build_age.strftime("%m/%d/%Y, %H:%M:%S"))
        removed_count = self.db.cleanup_builds(self.oldest_build_age)
        logging.info('removed %d entries', removed_count)
def __get_oldest_build_age(config: dict) -> datetime:
    """Return the timestamp before which builds are considered expired."""
    return datetime.now().astimezone() \
        - timedelta(days=config['data_store_days'])


def __get_db_config(config: dict) -> DbConfig:
    """Assemble a DbConfig from the raw YAML mapping."""
    return DbConfig(name=config['db_name'],
                    port=int(config['db_port']),
                    host=config['db_host'],
                    username=config['db_username'],
                    password=config['db_password'])


def __get_config(yml_path: str) -> ExtractorConfig:
    """
    Load the YAML config file and build an ExtractorConfig instance.
    """
    with open(yml_path, 'r', encoding='utf-8') as flr:
        raw = yaml.safe_load(flr)

    # adding derived attrs expected by ExtractorConfig
    raw['oldest_build_age'] = __get_oldest_build_age(raw)
    raw['db_config'] = __get_db_config(raw)

    return ExtractorConfig(**raw)


def start(yml_path: str):
    """
    Entry point: configure logging, extract new builds, purge old ones.

    Both phases run even if the first fails; unhandled errors are logged
    at CRITICAL rather than propagated.
    """
    config = __get_config(yml_path)

    # configuring logging: rotate at ~10 MB, keep 3 backups
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s <%(funcName)s> %(message)s',
                        handlers=[RotatingFileHandler(config.log_file,
                                                      maxBytes=10000000,
                                                      backupCount=3)])

    api = APIclient(api_root=config.albs_url, jwt=config.jwt)
    db = DB(config.db_config)
    extractor = Extractor(config, api, db)

    logging.info('Starting builds insertion')
    inserted_count = -1
    try:
        inserted_count = extractor.extract_and_store()
    except Exception as err:  # pylint: disable=broad-except
        logging.critical("Unhandled exception %s", err, exc_info=True)
    else:
        logging.info(
            'Build extraction was finished. %d builds were inserted',
            inserted_count)

    logging.info('Starting old builds removal')
    try:
        extractor.build_cleanup()
    except Exception as err:  # pylint: disable=broad-except
        logging.critical("Unhandled exception %s", err, exc_info=True)
    else:
        logging.info('Cleanup finished')
class Build(BaseModel):
    """A build as returned by the ALBS API, with its build tasks."""

    id: int
    url: HttpUrl
    build_tasks: List[BuildTask]
    created_at: datetime
    finished_at: Optional[datetime] = None

    def as_db_model(self) -> BuildDB:
        """Convert to the DB representation (datetimes become epoch floats)."""
        finished = self.finished_at.timestamp() if self.finished_at else None
        return BuildDB(id=self.id,
                       url=self.url,
                       created_at=self.created_at.timestamp(),
                       finished_at=finished)
class BuildTask(BaseModel):
    """A single build task (one arch of one build) from the ALBS API."""

    id: int
    build_id: int
    arch: str
    started_at: Optional[datetime] = None
    finished_at: Optional[datetime] = None
    status_id: int

    def as_db_model(self) -> BuildTaskDB:
        """
        Convert to the DB representation: datetimes become epoch floats
        and the arch name is mapped to its numeric ArchEnum id.
        """
        started = self.started_at.timestamp() if self.started_at else None
        finished = self.finished_at.timestamp() if self.finished_at else None
        return BuildTaskDB(id=self.id,
                           build_id=self.build_id,
                           arch_id=ArchEnum[self.arch].value,
                           started_at=started,
                           finished_at=finished,
                           status_id=self.status_id)
# Numeric ids mirror the arch_enum / build_task_enum DB tables; the
# functional IntEnum API assigns 0..4 in declaration order via start=0.
ArchEnum = IntEnum(
    'ArchEnum',
    'i686 x86_64 aarch64 ppc64le s390x',
    start=0,
)
ArchEnum.__doc__ = "Numeric ids for build architectures."

BuildTaskEnum = IntEnum(
    'BuildTaskEnum',
    'idle started completed failed excluded',
    start=0,
)
BuildTaskEnum.__doc__ = "Numeric ids for build task statuses."
-- builds
DROP TABLE IF EXISTS builds CASCADE;
CREATE TABLE builds (
    id INTEGER PRIMARY KEY,
    -- NOTE(review): 50 chars is tight for build URLs; confirm max length
    url VARCHAR(50) NOT NULL,
    -- epoch timestamps (seconds)
    created_at REAL NOT NULL,
    finished_at REAL
);

-- tables are dropped unconditionally above, so plain CREATE INDEX is
-- used consistently (the original mixed IF NOT EXISTS in some places)
CREATE INDEX builds_created_at
ON builds(created_at);

CREATE INDEX builds_finished_at
ON builds(finished_at);


-- build_task_enum
DROP TABLE IF EXISTS build_task_enum CASCADE;
CREATE TABLE build_task_enum(
    id INTEGER PRIMARY KEY,
    value VARCHAR(15)
);

INSERT INTO build_task_enum (id, value)
VALUES
    (0, 'idle'),
    (1, 'started'),
    (2, 'completed'),
    (3, 'failed'),
    (4, 'excluded');


-- arch_enum
DROP TABLE IF EXISTS arch_enum CASCADE;
CREATE TABLE arch_enum(
    id INTEGER PRIMARY KEY,
    value VARCHAR(15)
);

INSERT INTO arch_enum(id, value)
VALUES
    (0, 'i686'),
    (1, 'x86_64'),
    (2, 'aarch64'),
    (3, 'ppc64le'),
    (4, 's390x');


-- build_tasks
DROP TABLE IF EXISTS build_tasks CASCADE;
CREATE TABLE build_tasks (
    id INTEGER PRIMARY KEY,
    build_id INTEGER REFERENCES builds(id) ON DELETE CASCADE,
    arch_id INTEGER REFERENCES arch_enum(id) ON DELETE SET NULL,
    status_id INTEGER REFERENCES build_task_enum(id) ON DELETE SET NULL,
    started_at REAL,
    finished_at REAL
);

CREATE INDEX build_tasks_build_id
ON build_tasks(build_id);

CREATE INDEX build_tasks_started_at
ON build_tasks(started_at);

CREATE INDEX build_tasks_finished_at
ON build_tasks(finished_at);


-- sign_tasks
DROP TABLE IF EXISTS sign_tasks CASCADE;
CREATE TABLE sign_tasks (
    id INTEGER PRIMARY KEY,
    build_id INTEGER REFERENCES builds(id) ON DELETE CASCADE,
    buildtask_id INTEGER REFERENCES build_tasks(id) ON DELETE CASCADE,
    started_at REAL,
    finished_at REAL
);

CREATE INDEX sign_tasks_build_id
ON sign_tasks(build_id);

CREATE INDEX sign_tasks_buildtask_id
ON sign_tasks(buildtask_id);

-- BUGFIX: this index was misnamed sing_tasks_started_at
CREATE INDEX sign_tasks_started_at
ON sign_tasks(started_at);

CREATE INDEX sign_tasks_finished_at
ON sign_tasks(finished_at);
"""
extractor startup script
"""
import sys

from build_analytics.extractor.start import start

# guard clause instead of try/except IndexError: exit with usage text
# when the config path argument is missing
if len(sys.argv) < 2:
    print(f"Usage: {sys.argv[0]} config.yml")
    sys.exit(1)

start(sys.argv[1])