init commit

Kirill Zhukov 2023-02-27 20:51:53 +01:00
commit 913a998265
19 changed files with 520 additions and 0 deletions

.gitignore

@@ -0,0 +1,7 @@
venv
logs
*.pyc
__pycache__
.vscode
private*
debug*

build_analytics/api_client.py

@@ -0,0 +1,82 @@
from datetime import datetime
import logging
from typing import Dict, List
from urllib.parse import urljoin

import requests

from .models.build import Build
from .models.build_task import BuildTask

TZ_OFFSET = '+00:00'


class APIclient():
    """
    Client for working with the ALBS API
    """

    def __init__(self, api_root: str, jwt: str):
        self.api_root = api_root
        self.jwt = jwt

    def get_builds(self, page_num: int = 1) -> List[Build]:
        ep = '/api/v1/builds'
        url = urljoin(self.api_root, ep)
        params = {'pageNumber': page_num}
        headers = {'accept': 'application/json'}

        response = requests.get(url, params=params, headers=headers)
        response.raise_for_status()

        result = []
        for b in response.json()['builds']:
            try:
                result.append(self._parse_build(b))
            except Exception as err:  # pylint: disable=broad-except
                logging.error("Can't convert build JSON %s to Build model: %s",
                              b, err, exc_info=True)
        return result

    def _parse_build_tasks(self, tasks_json: Dict, build_id: int) -> List[BuildTask]:
        result = []
        for task in tasks_json:
            try:
                # API timestamps are naive UTC, so an explicit offset is appended
                started_at = datetime.fromisoformat(task['started_at'] + TZ_OFFSET) \
                    if task['started_at'] else None
                finished_at = datetime.fromisoformat(task['finished_at'] + TZ_OFFSET) \
                    if task['finished_at'] else None
                params = {'id': task['id'],
                          'build_id': build_id,
                          'started_at': started_at,
                          'finished_at': finished_at,
                          'arch': task['arch'],
                          'status_id': task['status']}
                result.append(BuildTask(**params))
            except Exception as err:  # pylint: disable=broad-except
                logging.error("Can't convert build_task JSON %s (build_id %s) to BuildTask model: %s",
                              task, build_id, err, exc_info=True)

        result.sort(key=lambda x: x.id, reverse=True)
        return result

    def _parse_build(self, build_json: Dict) -> Build:
        url = f"https://build.almalinux.org/build/{build_json['id']}"
        created_at = datetime.fromisoformat(build_json['created_at'] + TZ_OFFSET)
        finished_at = datetime.fromisoformat(build_json['finished_at'] + TZ_OFFSET) \
            if build_json['finished_at'] else None
        build_tasks = self._parse_build_tasks(build_json['tasks'], build_json['id'])

        params = {
            'id': build_json['id'],
            'url': url,
            'created_at': created_at,
            'finished_at': finished_at,
            'build_tasks': build_tasks}
        return Build(**params)
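
For context, a minimal usage sketch of APIclient, assuming the default ALBS URL from extractor_config.py and a placeholder JWT:

from build_analytics.api_client import APIclient

client = APIclient(api_root='https://build.almalinux.org',
                   jwt='<JWT token>')  # placeholder token
for build in client.get_builds(page_num=1):
    print(build.id, build.created_at, len(build.build_tasks))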

build_analytics/db.py

@@ -0,0 +1,66 @@
from datetime import datetime
from typing import Union

import psycopg2

from .models.build_db import BuildDB
from .models.build_task_db import BuildTaskDB
from .models.db_config import DbConfig


class DB():
    def __init__(self, config: DbConfig):
        self.conf = config

    def _get_conn(self):
        conn = psycopg2.connect(database=self.conf.name,
                                host=self.conf.host,
                                user=self.conf.username,
                                password=self.conf.password,
                                port=self.conf.port)
        return conn

    def insert_update_build(self, build: BuildDB):
        sql = '''
            INSERT INTO builds(id, url, created_at, finished_at)
            VALUES (%s, %s, %s, %s)
            ON CONFLICT (id) DO UPDATE SET
                (url, created_at, finished_at) = (EXCLUDED.url, EXCLUDED.created_at, EXCLUDED.finished_at);
        '''
        with self._get_conn() as conn:
            cur = conn.cursor()
            cur.execute(sql, (build.id, build.url,
                              build.created_at, build.finished_at))
            conn.commit()

    def insert_update_buildtask(self, build_task: BuildTaskDB):
        sql = '''
            INSERT INTO build_tasks(id, build_id, arch_id, started_at, finished_at, status_id)
            VALUES (%s, %s, %s, %s, %s, %s)
            ON CONFLICT (id) DO UPDATE SET
                (build_id, arch_id, started_at, finished_at, status_id) = (EXCLUDED.build_id, EXCLUDED.arch_id, EXCLUDED.started_at, EXCLUDED.finished_at, EXCLUDED.status_id);
        '''
        with self._get_conn() as conn:
            cur = conn.cursor()
            cur.execute(sql, (build_task.id, build_task.build_id, build_task.arch_id,
                              build_task.started_at, build_task.finished_at, build_task.status_id))
            conn.commit()

    def get_latest_build_id(self) -> Union[int, None]:
        sql = "SELECT id FROM builds ORDER BY id DESC LIMIT 1;"
        with self._get_conn() as conn:
            cur = conn.cursor()
            cur.execute(sql)
            val = cur.fetchone()
            if not val:
                return None
            return int(val[0])

    def cleanup_builds(self, oldest_to_keep: datetime) -> int:
        # timestamps are stored as epoch seconds, so compare against one
        params = (int(oldest_to_keep.timestamp()),)
        sql = "DELETE FROM builds WHERE created_at < %s;"
        with self._get_conn() as conn:
            cur = conn.cursor()
            cur.execute(sql, params)
            conn.commit()
            return cur.rowcount
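
A minimal sketch of wiring up DB, assuming a local PostgreSQL instance; all connection values are placeholders:

from build_analytics.db import DB
from build_analytics.models.db_config import DbConfig

config = DbConfig(name='albs_analytics', port=5432, host='localhost',
                  username='postgres', password='secret')  # placeholders
db = DB(config)
print(db.get_latest_build_id())  # None while the builds table is empty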

build_analytics/extractor/extractor.py

@@ -0,0 +1,45 @@
import logging

from ..api_client import APIclient
from ..db import DB
from ..models.extractor_config import ExtractorConfig


class Extractor:
    def __init__(self, config: ExtractorConfig, api: APIclient, db: DB):
        self.oldest_build_age = config.oldest_build_age
        self.api = api
        self.db = db

    def extract_and_store(self) -> int:
        build_count = 0
        page_num = 1
        last_build_id = self.db.get_latest_build_id()
        if not last_build_id:
            last_build_id = 0
        logging.info("last_build_id: %s", last_build_id)
        stop = False

        while not stop:
            logging.info("page: %s", page_num)
            for build in self.api.get_builds(page_num):
                # check if we should stop processing builds
                if build.id <= last_build_id or \
                        build.created_at <= self.oldest_build_age:
                    stop = True
                    break

                # inserting build and build tasks
                logging.info("inserting %s", build.id)
                self.db.insert_update_build(build.as_db_model())
                for build_task in build.build_tasks:
                    self.db.insert_update_buildtask(build_task.as_db_model())

                build_count += 1
            page_num += 1
        return build_count

    def build_cleanup(self):
        logging.info('Removing all builds older than %s',
                     self.oldest_build_age.strftime("%m/%d/%Y, %H:%M:%S"))
        removed_count = self.db.cleanup_builds(self.oldest_build_age)
        logging.info('removed %d entries', removed_count)
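
start.py (next file) performs this wiring from a YAML config; as a standalone illustration, a sketch with placeholder credentials and a 30-day retention window:

from datetime import datetime, timedelta

from build_analytics.api_client import APIclient
from build_analytics.db import DB
from build_analytics.extractor.extractor import Extractor
from build_analytics.models.db_config import DbConfig
from build_analytics.models.extractor_config import ExtractorConfig

db_config = DbConfig(name='albs_analytics', port=5432, host='localhost',
                     username='postgres', password='secret')  # placeholders
config = ExtractorConfig(jwt='<JWT token>',  # placeholder
                         db_config=db_config,
                         oldest_build_age=datetime.now().astimezone()
                         - timedelta(days=30))

extractor = Extractor(config,
                      APIclient(api_root=config.albs_url, jwt=config.jwt),
                      DB(config.db_config))
print(extractor.extract_and_store(), 'builds inserted')
extractor.build_cleanup()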

build_analytics/extractor/start.py

@@ -0,0 +1,71 @@
from datetime import datetime, timedelta
import logging
from logging.handlers import RotatingFileHandler

import yaml

from ..api_client import APIclient
from ..db import DB
from ..models.db_config import DbConfig
from ..models.extractor_config import ExtractorConfig
from .extractor import Extractor


def __get_oldest_build_age(config: dict) -> datetime:
    oldest_build_age = datetime.now().astimezone() \
        - timedelta(days=config['data_store_days'])
    return oldest_build_age


def __get_db_config(config: dict) -> DbConfig:
    return DbConfig(name=config['db_name'],
                    port=int(config['db_port']),
                    host=config['db_host'],
                    username=config['db_username'],
                    password=config['db_password'])


def __get_config(yml_path: str) -> ExtractorConfig:
    """
    Loads the yml file and generates an ExtractorConfig instance
    """
    with open(yml_path, 'r', encoding='utf-8') as flr:
        raw = yaml.safe_load(flr)

    # adding new attrs
    raw['oldest_build_age'] = __get_oldest_build_age(raw)
    raw['db_config'] = __get_db_config(raw)

    return ExtractorConfig(**raw)


def start(yml_path: str):
    config = __get_config(yml_path)

    # configuring logging
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s <%(funcName)s> %(message)s',
                        handlers=[RotatingFileHandler(config.log_file,
                                                      maxBytes=10000000,
                                                      backupCount=3)])

    api = APIclient(api_root=config.albs_url, jwt=config.jwt)
    db = DB(config.db_config)
    extractor = Extractor(config, api, db)

    logging.info('Starting builds insertion')
    inserted_count = -1
    try:
        inserted_count = extractor.extract_and_store()
    except Exception as err:  # pylint: disable=broad-except
        logging.critical("Unhandled exception %s", err, exc_info=True)
    else:
        logging.info(
            'Build extraction was finished. %d builds were inserted', inserted_count)

    logging.info('Starting old builds removal')
    try:
        extractor.build_cleanup()
    except Exception as err:  # pylint: disable=broad-except
        logging.critical("Unhandled exception %s", err, exc_info=True)
    else:
        logging.info('Cleanup finished')
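
For reference, a config.yml sketch covering every key __get_config reads; all values are placeholders, and albs_url and log_file fall back to the defaults in extractor_config.py if omitted:

albs_url: https://build.almalinux.org
jwt: "<JWT token>"
log_file: /tmp/extractor.log
data_store_days: 30
db_name: albs_analytics
db_port: 5432
db_host: localhost
db_username: postgres
db_password: secret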

build_analytics/models/build.py

@@ -0,0 +1,27 @@
from datetime import datetime
from typing import List, Optional

from pydantic import BaseModel, HttpUrl

from .build_db import BuildDB
from .build_task import BuildTask


class Build(BaseModel):
    id: int
    url: HttpUrl
    build_tasks: List[BuildTask]
    created_at: datetime
    finished_at: Optional[datetime] = None

    def as_db_model(self) -> BuildDB:
        # the database stores timestamps as epoch seconds (REAL columns)
        created_at = self.created_at.timestamp()
        finished_at = self.finished_at.timestamp() \
            if self.finished_at else None
        params = {
            'id': self.id,
            'url': self.url,
            'created_at': created_at,
            'finished_at': finished_at
        }
        return BuildDB(**params)

build_analytics/models/build_db.py

@@ -0,0 +1,12 @@
from typing import Optional

from pydantic import BaseModel, HttpUrl


class BuildDB(BaseModel):
    """
    Build as it is received from/sent to the database
    """
    id: int
    url: HttpUrl
    created_at: float
    finished_at: Optional[float] = None

build_analytics/models/build_task.py

@@ -0,0 +1,31 @@
from datetime import datetime
from typing import Optional

from pydantic import BaseModel

from .build_task_db import BuildTaskDB
from .enums import ArchEnum


class BuildTask(BaseModel):
    id: int
    build_id: int
    arch: str
    started_at: Optional[datetime] = None
    finished_at: Optional[datetime] = None
    status_id: int

    def as_db_model(self) -> BuildTaskDB:
        started_at = self.started_at.timestamp() \
            if self.started_at else None
        finished_at = self.finished_at.timestamp() \
            if self.finished_at else None
        params = {
            'id': self.id,
            'build_id': self.build_id,
            # map the arch name to its numeric id (see enums.ArchEnum)
            'arch_id': ArchEnum[self.arch].value,
            'started_at': started_at,
            'finished_at': finished_at,
            'status_id': self.status_id
        }
        return BuildTaskDB(**params)
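
A small illustration of the model-to-DB conversion, using made-up task values:

from datetime import datetime

from build_analytics.models.build_task import BuildTask

task = BuildTask(id=1, build_id=42, arch='x86_64',  # made-up values
                 started_at=datetime.fromisoformat('2023-02-27T10:00:00+00:00'),
                 status_id=1)
row = task.as_db_model()
print(row.arch_id)      # 1, i.e. ArchEnum['x86_64'].value
print(row.finished_at)  # None — the task has not finished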

build_analytics/models/build_task_db.py

@@ -0,0 +1,14 @@
from typing import Optional

from pydantic import BaseModel


class BuildTaskDB(BaseModel):
    """
    BuildTask as it is received from/sent to the database
    """
    id: int
    build_id: int
    arch_id: int
    started_at: Optional[float] = None
    finished_at: Optional[float] = None
    status_id: int

build_analytics/models/db_config.py

@@ -0,0 +1,9 @@
from pydantic import BaseModel, Field


class DbConfig(BaseModel):
    name: str = Field(description="db name")
    port: int = Field(description="db server port")
    host: str = Field(description="db server ip/hostname")
    username: str = Field(description="username to connect with")
    password: str = Field(description="password to connect with")

build_analytics/models/enums.py

@@ -0,0 +1,17 @@
from enum import IntEnum


class ArchEnum(IntEnum):
    """
    Maps architecture names to the ids stored in the arch_enum table
    """
    i686 = 0
    x86_64 = 1
    aarch64 = 2
    ppc64le = 3
    s390x = 4


class BuildTaskEnum(IntEnum):
    """
    Maps build task statuses to the ids stored in the build_task_enum table
    """
    idle = 0
    started = 1
    completed = 2
    failed = 3
    excluded = 4

build_analytics/models/extractor_config.py

@@ -0,0 +1,24 @@
from datetime import datetime
from pathlib import Path

from pydantic import BaseModel, Field, HttpUrl

from .db_config import DbConfig

# DEFAULTS
ALBS_URL_DEFAULT = 'https://build.almalinux.org'
LOG_FILE_DEFAULT = '/tmp/extractor.log'


class ExtractorConfig(BaseModel):
    """
    config model for the Extractor service
    """
    log_file: Path = Field(description='logfile path',
                           default=LOG_FILE_DEFAULT)
    albs_url: HttpUrl = Field(description='ALBS root URL',
                              default=ALBS_URL_DEFAULT)
    oldest_build_age: datetime = \
        Field(description='oldest build age to extract and store')
    jwt: str = Field(description='ALBS JWT token')
    db_config: DbConfig = Field(description="database configuration")

build_analytics/models/sign_task_db.py

@@ -0,0 +1,12 @@
from typing import Optional

from pydantic import BaseModel


class SignTaskDB(BaseModel):
    """
    SignTask as it is received from/sent to the database
    """
    id: int
    build_id: int
    build_task_id: int
    started_at: Optional[float] = None
    finished_at: Optional[float] = None

SQL schema

@@ -0,0 +1,91 @@
-- builds
DROP TABLE IF EXISTS builds CASCADE;

CREATE TABLE builds (
    id INTEGER PRIMARY KEY,
    url VARCHAR(50) NOT NULL,
    created_at REAL NOT NULL,
    finished_at REAL
);

CREATE INDEX builds_created_at
ON builds(created_at);

CREATE INDEX builds_finished_at
ON builds(finished_at);

-- build_task_enum
DROP TABLE IF EXISTS build_task_enum CASCADE;

CREATE TABLE build_task_enum(
    id INTEGER PRIMARY KEY,
    value VARCHAR(15)
);

INSERT INTO build_task_enum (id, value)
VALUES
    (0, 'idle'),
    (1, 'started'),
    (2, 'completed'),
    (3, 'failed'),
    (4, 'excluded');

-- arch_enum
DROP TABLE IF EXISTS arch_enum CASCADE;

CREATE TABLE arch_enum(
    id INTEGER PRIMARY KEY,
    value VARCHAR(15)
);

INSERT INTO arch_enum(id, value)
VALUES
    (0, 'i686'),
    (1, 'x86_64'),
    (2, 'aarch64'),
    (3, 'ppc64le'),
    (4, 's390x');

-- build_tasks
DROP TABLE IF EXISTS build_tasks CASCADE;

CREATE TABLE build_tasks (
    id INTEGER PRIMARY KEY,
    build_id INTEGER REFERENCES builds(id) ON DELETE CASCADE,
    arch_id INTEGER REFERENCES arch_enum(id) ON DELETE SET NULL,
    status_id INTEGER REFERENCES build_task_enum(id) ON DELETE SET NULL,
    started_at REAL,
    finished_at REAL
);

CREATE INDEX build_tasks_build_id
ON build_tasks(build_id);

CREATE INDEX build_tasks_started_at
ON build_tasks(started_at);

CREATE INDEX build_tasks_finished_at
ON build_tasks(finished_at);

-- sign_tasks
DROP TABLE IF EXISTS sign_tasks CASCADE;

CREATE TABLE sign_tasks (
    id INTEGER PRIMARY KEY,
    build_id INTEGER REFERENCES builds(id) ON DELETE CASCADE,
    buildtask_id INTEGER REFERENCES build_tasks(id) ON DELETE CASCADE,
    started_at REAL,
    finished_at REAL
);

CREATE INDEX sign_tasks_build_id
ON sign_tasks(build_id);

CREATE INDEX sign_tasks_buildtask_id
ON sign_tasks(buildtask_id);

CREATE INDEX sign_tasks_started_at
ON sign_tasks(started_at);

CREATE INDEX sign_tasks_finished_at
ON sign_tasks(finished_at);
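
As a usage sketch against this schema, a Python example computing the average task duration per architecture; table and column names come from the DDL above, while the connection values are placeholders:

import psycopg2

conn = psycopg2.connect(database='albs_analytics', host='localhost',
                        user='postgres', password='secret', port=5432)  # placeholders
cur = conn.cursor()
# finished tasks only: started_at/finished_at are epoch seconds (REAL)
cur.execute('''
    SELECT a.value, AVG(bt.finished_at - bt.started_at)
    FROM build_tasks bt
    JOIN arch_enum a ON a.id = bt.arch_id
    WHERE bt.started_at IS NOT NULL AND bt.finished_at IS NOT NULL
    GROUP BY a.value;
''')
for arch, avg_seconds in cur.fetchall():
    print(arch, avg_seconds)
conn.close()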

extractor startup script

@@ -0,0 +1,12 @@
"""
extractor startup script
"""
import sys
from build_analytics.extractor.start import start
try:
YAML_PATH = sys.argv[1]
except IndexError:
print(f"Usage: {sys.argv[0]} config.yml")
sys.exit(1)
start(YAML_PATH)