Compare commits

..

6 Commits

Author SHA1 Message Date
c731cba102 Release 0.3.6 (2024-10-08)
build_analytics:
  buildsystem#360 Added src and x86_64_v2 arches
2024-10-08 16:18:11 +02:00
7c05bbacb6 Release 0.3.5 (2023-06-01)
build_analytics:
  ALBS-1103 start using persistent HTTP connections
2023-06-01 11:57:27 +02:00
d47fe3b4cd Release 0.3.4 (2023-05-12)
build_analytics
  - Bugfix ALBS-1111
2023-05-12 11:22:55 +02:00
f74bc0748a 0.3.3 (2023-04-24)
build-analytics
  Improvements
    - [ALBS-1077] start deleting builds that were removed from ALBS
  Bugfixes
    - 'Key error' when db_port/db_host is not set
    - update_builds() ignoring oldest_to_update attribute
    - [ALBS-1099] Test task started_at attribute is NULL
    - Max recursion error in 'Test task details.json'
2023-04-24 09:20:58 +02:00
5a590cbadb build_analytics:
[ALBS-1077] Now we delete build if it was deleted from ALBS
  Bugfix 'Key error' when db_port/db_host is not set
  Bugfix update_builds ignoring oldest_to_update attribute
2023-04-21 15:13:48 +02:00
kzhukov
4b5adb52d5 ALBS-1099 (#4)
Co-authored-by: Kirill Zhukov <kzhukov@cloudlinux.com>
Reviewed-on: #4
2023-04-21 07:53:09 +00:00
10 changed files with 153 additions and 63 deletions

View File

@ -27,6 +27,8 @@ class APIclient():
self.api_root = api_root self.api_root = api_root
self.jwt = jwt self.jwt = jwt
self.timeout = timeout self.timeout = timeout
# will be set at first call of __send_request
self.session: Optional[requests.Session] = None
def get_builds(self, page_num: int = 1) -> List[Build]: def get_builds(self, page_num: int = 1) -> List[Build]:
ep = '/api/v1/builds' ep = '/api/v1/builds'
@ -34,8 +36,7 @@ class APIclient():
params = {'pageNumber': page_num} params = {'pageNumber': page_num}
headers = {'accept': 'appilication/json'} headers = {'accept': 'appilication/json'}
response = requests.get( response = self.__send_request(url, 'get', params, headers)
url, params=params, headers=headers, timeout=self.timeout)
response.raise_for_status() response.raise_for_status()
result = [] result = []
@ -47,11 +48,18 @@ class APIclient():
b, err, exc_info=True) b, err, exc_info=True)
return result return result
def get_build(self, build_id: int) -> Build: def get_build(self, build_id: int) -> Optional[Build]:
'''
method returns None if build was deleted from ALBS
'''
ep = f'/api/v1/builds/{build_id}' ep = f'/api/v1/builds/{build_id}'
url = urljoin(self.api_root, ep) url = urljoin(self.api_root, ep)
headers = {'accept': 'application/json'} headers = {'accept': 'application/json'}
response = requests.get(url, headers=headers, timeout=self.timeout) response = self.__send_request(url, 'get', headers=headers)
if response.status_code == 404:
return None
response.raise_for_status() response.raise_for_status()
return self._parse_build(response.json()) return self._parse_build(response.json())
@ -221,3 +229,31 @@ class APIclient():
start_ts = stat.start_ts start_ts = stat.start_ts
return start_ts return start_ts
def __send_request(self,
                   url: str,
                   method: str,
                   params: Optional[Dict[str, Any]] = None,
                   headers: Optional[Dict[str, Any]] = None,
                   ) -> requests.Response:
    """
    Send an HTTP request through the client's shared requests.Session.

    The session is created lazily on the first call and kept on the
    instance, so consecutive API calls go through the same session.

    Args:
        url: absolute URL to request.
        method: HTTP verb name ('get', 'post', ...), case-insensitive.
        params: optional query-string parameters.
        headers: optional request headers.

    Returns:
        The requests.Response returned by the session call. The status
        code is not checked here; callers decide how to handle it.

    Raises:
        ValueError: if method is not a supported HTTP verb.
    """
    # Only real HTTP verbs are accepted: a bare getattr() lookup would also
    # match unrelated Session attributes such as 'close' or 'headers'.
    supported_verbs = ('get', 'post', 'put', 'patch', 'delete',
                       'head', 'options')
    verb = method.lower()
    if verb not in supported_verbs:
        raise ValueError(f"method {method} is not supported")
    if self.session is None:
        self.session = requests.Session()
    send = getattr(self.session, verb)
    return send(url, params=params, headers=headers, timeout=self.timeout)
def close_session(self):
    """
    Close the shared HTTP session, if one was opened.

    Safe to call repeatedly: when there is no session this is a no-op.
    The session reference is cleared even if close() raises, so a stale
    session object is never left on the instance.
    """
    if self.session is None:
        return
    try:
        self.session.close()
    finally:
        self.session = None
def __del__(self):
    """
    Release the HTTP session when the client object is destroyed.
    """
    # __del__ may run on an instance whose __init__ did not finish, in
    # which case 'session' was never assigned; skip the cleanup then.
    if getattr(self, 'session', None) is not None:
        self.close_session()

View File

@ -3,7 +3,7 @@
from enum import IntEnum from enum import IntEnum
# supported schema version # supported schema version
DB_SCHEMA_VER = 3 DB_SCHEMA_VER = 4
# ENUMS # ENUMS
@ -13,6 +13,8 @@ class ArchEnum(IntEnum):
aarch64 = 2 aarch64 = 2
ppc64le = 3 ppc64le = 3
s390x = 4 s390x = 4
src = 5
x86_64_v2 = 6
class BuildTaskEnum(IntEnum): class BuildTaskEnum(IntEnum):

View File

@ -62,34 +62,34 @@ class DB():
build_task.started_at, build_task.finished_at, build_task.status_id)) build_task.started_at, build_task.finished_at, build_task.status_id))
# inserting web node stats # inserting web node stats
for stat in web_node_stats: for wn_stat in web_node_stats:
# do not insert empty stats # do not insert empty stats
if stat.start_ts is None: if wn_stat.start_ts is None:
continue continue
sql = ''' sql = '''
INSERT INTO web_node_stats (build_task_id, stat_name_id, start_ts, end_ts) INSERT INTO web_node_stats (build_task_id, stat_name_id, start_ts, end_ts)
VALUES (%s, %s, %s, %s); VALUES (%s, %s, %s, %s);
''' '''
cur.execute(sql, (stat.build_task_id, stat.stat_name_id, cur.execute(sql, (wn_stat.build_task_id, wn_stat.stat_name_id,
stat.start_ts, stat.end_ts)) wn_stat.start_ts, wn_stat.end_ts))
logging.debug('raw SQL query: %s', cur.query) logging.debug('raw SQL query: %s', cur.query)
self.__conn.commit() self.__conn.commit()
# inserting build node stats # inserting build node stats
for stat in build_node_stats: for bn_stat in build_node_stats:
# do not insert empty stats # do not insert empty stats
if stat.start_ts is None: if bn_stat.start_ts is None:
continue continue
sql = ''' sql = '''
INSERT INTO build_node_stats(build_task_id, stat_name_id, start_ts, end_ts) INSERT INTO build_node_stats(build_task_id, stat_name_id, start_ts, end_ts)
VALUES (%s, %s, %s, %s); VALUES (%s, %s, %s, %s);
''' '''
cur.execute(sql, (stat.build_task_id, stat.stat_name_id, cur.execute(sql, (bn_stat.build_task_id, bn_stat.stat_name_id,
stat.start_ts, stat.end_ts)) bn_stat.start_ts, bn_stat.end_ts))
logging.debug('raw SQL query: %s', cur.query) logging.debug('raw SQL query: %s', cur.query)
# commiting changes # commiting changes
@ -121,11 +121,12 @@ class DB():
# getting unfinished builds # getting unfinished builds
sql = 'SELECT id FROM builds where finished_at is NULL AND created_at > %s;' sql = 'SELECT id FROM builds where finished_at is NULL AND created_at > %s;'
builds_to_check: Dict[int, bool] = {}
cur = self.__conn.cursor() cur = self.__conn.cursor()
cur.execute(sql, (not_before.timestamp(),)) cur.execute(sql, (not_before.timestamp(),))
logging.debug('raw SQL query: %s', cur.query) logging.debug('raw SQL query: %s', cur.query)
for row in cur.fetchall(): for row in cur.fetchall():
res[row[0]] = {} builds_to_check[row[0]] = True
# getting list of unfinished tasks # getting list of unfinished tasks
sql = 'SELECT id, build_id, status_id FROM build_tasks WHERE status_id < 2;' sql = 'SELECT id, build_id, status_id FROM build_tasks WHERE status_id < 2;'
@ -135,6 +136,8 @@ class DB():
build_task_id: int = row[0] build_task_id: int = row[0]
build_id: int = row[1] build_id: int = row[1]
status_id: int = row[2] status_id: int = row[2]
if build_id not in builds_to_check:
continue
try: try:
res[build_id][build_task_id] = status_id res[build_id][build_task_id] = status_id
except KeyError: except KeyError:
@ -195,11 +198,11 @@ class DB():
logging.debug('raw SQL query: %s', cur.query) logging.debug('raw SQL query: %s', cur.query)
# updating build_node_stats # updating build_node_stats
for stat in build_node_stats: for bn_stat in build_node_stats:
logging.debug( logging.debug(
'updating build_node_stats %s build_task %s', stat.stat_name_id, build_task.id) 'updating build_node_stats %s build_task %s', bn_stat.stat_name_id, build_task.id)
if self.stat_exists(task_id=stat.build_task_id, if self.stat_exists(task_id=bn_stat.build_task_id,
stat_name_id=stat.stat_name_id, stat_name_id=bn_stat.stat_name_id,
table_name='build_node_stats', table_name='build_node_stats',
column_name='build_task_id'): column_name='build_task_id'):
sql = ''' sql = '''
@ -213,9 +216,9 @@ class DB():
VALUES (%(build_task_id)s, %(stat_name_id)s, %(start_ts)s, %(end_ts)s); VALUES (%(build_task_id)s, %(stat_name_id)s, %(start_ts)s, %(end_ts)s);
''' '''
params = {'build_task_id': build_task.id, params = {'build_task_id': build_task.id,
'stat_name_id': stat.stat_name_id, 'stat_name_id': bn_stat.stat_name_id,
'start_ts': stat.start_ts, 'start_ts': bn_stat.start_ts,
'end_ts': stat.end_ts} 'end_ts': bn_stat.end_ts}
logging.debug('raw SQL query: %s', cur.query) logging.debug('raw SQL query: %s', cur.query)
cur.execute(sql, params) cur.execute(sql, params)
@ -318,3 +321,11 @@ class DB():
s.start_ts, s.finish_ts)) s.start_ts, s.finish_ts))
# commiting changes # commiting changes
self.__conn.commit() self.__conn.commit()
def delete_build(self, build_id: int) -> None:
    """
    Delete a single build from the 'builds' table and commit.

    Args:
        build_id: id of the build row to remove.

    NOTE(review): only the 'builds' row is deleted here; rows in dependent
    tables presumably go away via ON DELETE CASCADE - confirm in the schema.
    """
    sql = "DELETE FROM builds WHERE id = %s;"
    cur = self.__conn.cursor()
    # the id is passed as a query parameter, never formatted into the SQL
    cur.execute(sql, (build_id,))
    # same debug trace the other DB methods emit after execute()
    logging.debug('raw SQL query: %s', cur.query)
    self.__conn.commit()

View File

@ -1,8 +1,10 @@
# pylint: disable=relative-beyond-top-level # pylint: disable=relative-beyond-top-level
from datetime import datetime, timedelta
import logging import logging
from typing import Dict, List from typing import Dict, List
from ..api_client import APIclient from ..api_client import APIclient
from ..const import BuildTaskEnum from ..const import BuildTaskEnum
from ..db import DB from ..db import DB
@ -26,11 +28,13 @@ class Extractor:
stop = False stop = False
while not stop: while not stop:
oldest_build_age = datetime.now().astimezone() - \
timedelta(days=self.config.data_store_days)
logging.info("page: %s", page_num) logging.info("page: %s", page_num)
for build in self.api.get_builds(page_num): for build in self.api.get_builds(page_num):
# check if we shoud stop processing build # check if we shoud stop processing build
if build.id <= last_build_id or \ if build.id <= last_build_id or \
build.created_at <= self.config.oldest_build_age: build.created_at <= oldest_build_age:
stop = True stop = True
break break
@ -73,9 +77,10 @@ class Extractor:
return build_count return build_count
def build_cleanup(self): def build_cleanup(self):
logging.info('Removing all buidls older then %s', oldest_to_keep = datetime.now().astimezone() - \
self.config.oldest_build_age.strftime("%m/%d/%Y, %H:%M:%S")) timedelta(days=self.config.data_store_days)
removed_count = self.db.cleanup_builds(self.config.oldest_build_age) logging.info('Removing all buidls older then %s', oldest_to_keep)
removed_count = self.db.cleanup_builds(oldest_to_keep)
logging.info('removed %d entries', removed_count) logging.info('removed %d entries', removed_count)
def __update_build_tasks(self, build_tasks: List[BuildTask], def __update_build_tasks(self, build_tasks: List[BuildTask],
@ -105,13 +110,20 @@ class Extractor:
b.build_id, b.id, BuildTaskEnum(b.status_id).name) b.build_id, b.id, BuildTaskEnum(b.status_id).name)
def update_builds(self): def update_builds(self):
logging.info('Getting list of tasks from DB') not_before = datetime.now().astimezone() - \
unfinished_tasks = self.db.get_unfinished_builds( timedelta(days=self.config.oldest_to_update_days)
self.config.oldest_to_update) logging.info('Getting unfinished builds that were created after %s ',
not_before)
unfinished_tasks = self.db.get_unfinished_builds(not_before)
for build_id, build_tasks_db in unfinished_tasks.items(): for build_id, build_tasks_db in unfinished_tasks.items():
try: try:
logging.info('Getting status of build %d', build_id) logging.info('Getting status of build %d', build_id)
build = self.api.get_build(build_id) build = self.api.get_build(build_id)
if not build:
logging.warning(
"build %s was deleted from albs, removing it", build_id)
self.db.delete_build(build_id)
continue
logging.info('Updating build tasks') logging.info('Updating build tasks')
build_tasks_to_check = [ build_tasks_to_check = [
@ -131,10 +143,12 @@ class Extractor:
build_id, err, exc_info=True) build_id, err, exc_info=True)
def updating_test_tasks(self): def updating_test_tasks(self):
not_before = datetime.now().astimezone() - \
timedelta(days=self.config.oldest_to_update_days)
logging.info('getting build tasks for builds created after %s', logging.info('getting build tasks for builds created after %s',
self.config.oldest_to_update) not_before)
build_task_ids = self.db.get_build_tasks_for_tests_update( build_task_ids = self.db.get_build_tasks_for_tests_update(
self.config.oldest_to_update) not_before)
for build_task_id in build_task_ids: for build_task_id in build_task_ids:
try: try:
logging.info('getting tests for build task %s', build_task_id) logging.info('getting tests for build task %s', build_task_id)

View File

@ -1,8 +1,8 @@
from datetime import datetime, timedelta
import logging import logging
from logging.handlers import RotatingFileHandler from logging.handlers import RotatingFileHandler
import sys import sys
import time import time
from typing import Dict, Any
import yaml import yaml
@ -22,19 +22,15 @@ def __get_config(yml_path: str) -> ExtractorConfig:
with open(yml_path, 'r', encoding='utf-8') as flr: with open(yml_path, 'r', encoding='utf-8') as flr:
raw = yaml.safe_load(flr) raw = yaml.safe_load(flr)
# adding new attrs # Dbconfig
raw['oldest_build_age'] = datetime.now().astimezone() \ db_params: Dict[str, Any] = {'name': raw['db_name'],
- timedelta(days=raw['data_store_days']) 'username': raw['db_username'],
'password': raw['db_password'], }
raw['db_config'] = DbConfig(name=raw['db_name'], if 'db_port' in raw:
port=int(raw['db_port']), db_params['port'] = raw['db_port']
host=raw['db_host'], if 'db_host' in raw:
username=raw['db_username'], db_params['host'] = raw['db_host']
password=raw['db_password']) raw['db_config'] = DbConfig(**db_params)
if 'oldest_to_update_days' in raw:
raw['oldest_to_update_days'] = datetime.now().astimezone() \
- timedelta(days=raw['oldest_to_update_days'])
return ExtractorConfig(**raw) return ExtractorConfig(**raw)
@ -103,7 +99,10 @@ def start(yml_path: str):
else: else:
logging.info('test tasks were updated') logging.info('test tasks were updated')
# freeing up resources
extractor.db.close_conn() extractor.db.close_conn()
extractor.api.close_session()
logging.info("Extraction was finished") logging.info("Extraction was finished")
logging.info("Sleeping for %d seconds", config.scrape_interval) logging.info("Sleeping for %d seconds", config.scrape_interval)
time.sleep(config.scrape_interval) time.sleep(config.scrape_interval)

View File

@ -1,9 +1,13 @@
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
DB_PORT = 5432
DB_HOST = "localhost"
class DbConfig(BaseModel): class DbConfig(BaseModel):
name: str = Field(description="db name") name: str = Field(description="db name")
port: int = Field(description="db server port") port: int = Field(description="db server port", default=DB_PORT)
host: str = Field(description="db server ip/hostname") host: str = Field(description="db server ip/hostname", default=DB_HOST)
username: str = Field(description="username to connect with") username: str = Field(description="username to connect with")
password: str = Field(description="password to connect with1") password: str = Field(description="password to connect with1")

View File

@ -1,4 +1,3 @@
from datetime import datetime, timedelta
from pathlib import Path from pathlib import Path
from pydantic import HttpUrl, Field, BaseModel # pylint: disable=no-name-in-module from pydantic import HttpUrl, Field, BaseModel # pylint: disable=no-name-in-module
@ -8,10 +7,10 @@ from .db_config import DbConfig
# DEFAULTS # DEFAULTS
ALBS_URL_DEFAULT = 'https://build.almalinux.org' ALBS_URL_DEFAULT = 'https://build.almalinux.org'
LOG_FILE_DEFAULT = '/tmp/extractor.log' LOG_FILE_DEFAULT = '/tmp/extractor.log'
API_DEFAULT = 30 API_TIMEOUT_DEFAULT = 30
SCRAPE_INTERVAL_DEFAULT = 3600 SCRAPE_INTERVAL_DEFAULT = 3600
START_FROM_DEFAULT = 5808 START_FROM_DEFAULT = 5808
OLDEST_TO_UPDATE_DEFAULT = datetime.now().astimezone() - timedelta(days=7) OLDEST_TO_UPDATE_DAYS_DEFAULT = 7
class ExtractorConfig(BaseModel): class ExtractorConfig(BaseModel):
@ -22,17 +21,17 @@ class ExtractorConfig(BaseModel):
default=LOG_FILE_DEFAULT) default=LOG_FILE_DEFAULT)
albs_url: HttpUrl = Field(description='ALBS root URL', albs_url: HttpUrl = Field(description='ALBS root URL',
default=ALBS_URL_DEFAULT) default=ALBS_URL_DEFAULT)
oldest_build_age: datetime = \ data_store_days: int = \
Field(description='oldest build age to store') Field(description='oldest build (in days) to keep in DB')
jwt: str = Field(description='ALBS JWT token') jwt: str = Field(description='ALBS JWT token')
db_config: DbConfig = Field(description="database configuration") db_config: DbConfig = Field(description="database configuration")
api_timeout: int = Field( api_timeout: int = Field(
description="max time in seconds to wait for API response", description="max time in seconds to wait for API response",
default=API_DEFAULT) default=API_TIMEOUT_DEFAULT)
scrape_interval: int = Field(description='how often (in seconds) we will extract data from ALBS', scrape_interval: int = Field(description='how often (in seconds) we will extract data from ALBS',
default=SCRAPE_INTERVAL_DEFAULT) default=SCRAPE_INTERVAL_DEFAULT)
start_from: int = Field(description='build id to start populating empty db with', start_from: int = Field(description='build id to start populating empty db with',
default=START_FROM_DEFAULT) default=START_FROM_DEFAULT)
oldest_to_update: datetime = \ oldest_to_update_days: int = \
Field(description='oldest unfinished object (build/task/step...) that we will try to update', Field(description='oldest (in days) unfinished object (build/task/step...) that we will try to update',
default=OLDEST_TO_UPDATE_DEFAULT) default=OLDEST_TO_UPDATE_DAYS_DEFAULT)

View File

@ -10,7 +10,6 @@ albs_url: https://build.almalinux.org
# required: yes # required: yes
jwt: "" jwt: ""
# db_host # db_host
# IP/hostname of database server # IP/hostname of database server
# required: no # required: no
@ -28,7 +27,6 @@ db_port: 5432
# required: yes # required: yes
db_username: albs_analytics db_username: albs_analytics
# db_password # db_password
# password to connect with # password to connect with
# required: yes # required: yes
@ -39,7 +37,6 @@ db_password: super_secret_password
# required: yes # required: yes
db_name: albs_analytics db_name: albs_analytics
# log_file # log_file
# file to write logs to # file to write logs to
# required: no # required: no
@ -62,7 +59,7 @@ scrape_interval: 3600
# default: 5808 (first build with correct metrics) # default: 5808 (first build with correct metrics)
start_from: 5808 start_from: 5808
# oldest_to_update # oldest_to_update_days
# oldest (in days) unfinished object (build/task/step...) that we will try to update # oldest (in days) unfinished object (build/task/step...) that we will try to update
# required: false # required: false
# default: 7 # default: 7

View File

@ -0,0 +1,11 @@
-- Schema migration 3 -> 4: register the 'src' and 'x86_64_v2' architectures.
BEGIN;

-- ids match ArchEnum.src (5) and ArchEnum.x86_64_v2 (6) in the Python constants
INSERT INTO arch_enum (id, value) VALUES (5, 'src');
INSERT INTO arch_enum (id, value) VALUES (6, 'x86_64_v2');

-- mark the database as being at schema version 4
UPDATE schema_version SET version = 4;

COMMIT;

View File

@ -21,7 +21,24 @@ First version
0.3.2 (2023-03-23) 0.3.2 (2023-03-23)
- Bugfix ALBS-1060 - Bugfix ALBS-1060
0.3.3 (IN PROGRESS) 0.3.3 (2023-04-24)
build-analytics: build-analytics
- [ALBS-1099] change source of Test task started_at timestamp Improvements
- [ALBS-1077] start deleting builds that were removed from ALBS - [ALBS-1077] start deleting builds that were removed from ALBS
Bugfixes
- 'Key error' when db_port/db_host is not set
- update_builds() ignoring oldest_to_update attribute
- [ALBS-1099] Test task started_at attribute is NULL
- Max recursion error in 'Test task details.json'
0.3.4 (2023-05-12)
build_analytics
- Bugfix ALBS-1111
0.3.5 (2023-06-01)
build_analytics:
ALBS-1103 start using persistent HTTP connections
0.3.6 (2024-10-08)
build_analytics:
buildsystem#360 Added src and x86_64_v2 arches