diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml index 93604e8..4e1b6c5 100644 --- a/.github/workflows/pylint.yml +++ b/.github/workflows/pylint.yml @@ -1,9 +1,19 @@ -name: Pylint +name: Linters on: [push] jobs: - build: + + docker-lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: hadolint/hadolint-action@v2.1.0 + with: + recursive: true + ignore: DL3041 + + python-lint: runs-on: ubuntu-latest strategy: diff --git a/Dockerfile b/Dockerfile index ad6742e..0ab5138 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,12 +2,12 @@ FROM fedora:36 WORKDIR /root -# for nvme-stas -RUN dnf install -y python3-dasbus python3-pyudev python3-systemd python3-gobject meson -# for libnvme -RUN dnf install -y git gcc g++ cmake openssl-devel libuuid-devel json-c-devel swig python-devel meson +# first line for nvme-stas +# second line for libnvme +RUN dnf install -y python3-dasbus python3-pyudev python3-systemd python3-gobject meson \ + git gcc g++ cmake openssl-devel libuuid-devel json-c-devel swig python-devel meson && dnf clean all COPY . . -RUN meson .build && ninja -C .build && cd .build && meson install +RUN meson .build && ninja -C .build && meson install -C .build ENTRYPOINT ["python3"] diff --git a/NEWS.md b/NEWS.md index d1515cd..f56a7c9 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,6 +5,7 @@ - Fix issues with I/O controller connection audits - Eliminate pcie devices from list of I/O controller connections to audit - Add soaking timer to workaround race condition between kernel and user-space applications on "add" uevents. When the kernel adds a new nvme device (e.g. `/dev/nvme7`) and sends a "add" uevent to notify user-space applications, the attributes associated with that device (e.g. `/sys/class/nvme/nvme7/cntrltype`) may not be fully initialized which can lead `stacd` to dismiss a device that should get audited. 
+- Make `sticky-connections=enabled` the default (see `stacd.conf`) ## Changes with release 1.1.5 @@ -32,7 +33,7 @@ stacd: Bug fix. Check that self._cfg_soak_tmr is not None before dereferencing i ## Changes with release 1.1.1 -Make `sticky-connections-disabled` by default +Make `sticky-connections=disabled` the default (see `stacd.conf`) ## Changes with release 1.1 diff --git a/coverage.sh.in b/coverage.sh.in index 96b8c53..5ba2ebe 100755 --- a/coverage.sh.in +++ b/coverage.sh.in @@ -38,14 +38,24 @@ PRIMARY_GRP=$( id -ng ) PRIMARY_USR=$( id -nu ) PYTHON_PATH=.:./subprojects/libnvme +log() { + msg="$1" + printf "%b[1;36m%s%b[0m\n" "\0033" "${msg}" "\0033" + sudo logger -i "@@@@@ COVERAGE -" -p 4 "${msg}" +} + sd_stop() { - unit="$1"-cov.service + app="$1" + unit="${app}"-cov.service + log "Stop ${app}" sudo systemctl stop "${unit}" >/dev/null 2>&1 sudo systemctl reset-failed "${unit}" >/dev/null 2>&1 } sd_restart() { - unit="$1"-cov.service + app="$1" + unit="${app}"-cov.service + log "Restart ${app}" sudo systemctl restart "${unit}" >/dev/null 2>&1 } @@ -61,7 +71,7 @@ sd_start() { cmd="${app} --syslog -f ${conf}" fi - printf "\n%b[1;36m%s%b[0m\n" "\0033" "Start ${app}" "\0033" + log "Start ${app}" RUNTIME_DIRECTORY=/tmp/${app} rm -rf ${RUNTIME_DIRECTORY} @@ -75,7 +85,7 @@ reload_cfg() { app="$1" unit="${app}"-cov.service pid=$( systemctl show --property MainPID --value "${unit}" ) - printf "%b[1;36m%s%b[0m\n" "\0033" "Reload config ${app}" "\0033" + log "Reload config ${app}" sudo kill -HUP "${pid}" } @@ -83,15 +93,24 @@ if [ ! 
-d coverage ]; then mkdir coverage fi + +log "START-START-START-START-START-START-START-START-START-START-START-START" + + + ################################################################################ # Load nvme kernel module +log "modprobe nvme-tcp" sudo /usr/sbin/modprobe nvme-tcp +log "nvme disconnect-all" sudo nvme disconnect-all ################################################################################ # Create a dummy config file for @STAFD_PROCNAME@ -stafd_conf_fname=$(mktemp /tmp/@STAFD_PROCNAME@.conf.XXXXXX) +file=/tmp/@STAFD_PROCNAME@.conf.XXXXXX +log "Create dummy config file $file" +stafd_conf_fname=$(mktemp $file) cat > "${stafd_conf_fname}" <<'EOF' [Global] tron=true @@ -102,7 +121,9 @@ EOF ################################################################################ # Create a dummy config file for @STACD_PROCNAME@ -stacd_conf_fname=$(mktemp /tmp/@STACD_PROCNAME@.conf.XXXXXX) +file=/tmp/@STACD_PROCNAME@.conf.XXXXXX +log "Create dummy config file $file" +stacd_conf_fname=$(mktemp $file) cat > "${stacd_conf_fname}" <<'EOF' [Global] tron=true @@ -111,6 +132,7 @@ udev-rule=disabled sticky-connections=enabled EOF +log "Stop & Mask Avahi daemon" sudo systemctl stop avahi-daemon.service sudo systemctl stop avahi-daemon.socket sudo systemctl mask avahi-daemon.service @@ -118,11 +140,11 @@ sudo systemctl mask avahi-daemon.socket sleep 1 -printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_CTLNAME@ status while @STAFD_PROCNAME@ is not running" "\0033" +log "Invoking @STAFD_CTLNAME@ status while @STAFD_PROCNAME@ is not running" coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ ls >/dev/null 2>&1 coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ invalid-command >/dev/null 2>&1 -printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STACD_CTLNAME@ status while @STACD_PROCNAME@ is not running" "\0033" +log "Invoking @STACD_CTLNAME@ status while @STACD_PROCNAME@ is not running" coverage run --rcfile=.coveragerc @STACD_CTLNAME@ ls >/dev/null 2>&1 coverage run 
--rcfile=.coveragerc @STACD_CTLNAME@ invalid-command >/dev/null 2>&1 @@ -132,30 +154,33 @@ sd_start "@STAFD_PROCNAME@" "@STAFD_DBUS_NAME@" "${stafd_conf_fname}" sd_start "@STACD_PROCNAME@" "@STACD_DBUS_NAME@" "${stacd_conf_fname}" sleep 3 -printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_CTLNAME@ status" "\0033" +log "Invoking @STAFD_CTLNAME@ status" coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ status >/dev/null 2>&1 reload_cfg "@STAFD_PROCNAME@" sleep 1 +log "Restart Avahi daemon" sudo systemctl unmask avahi-daemon.socket sudo systemctl unmask avahi-daemon.service sudo systemctl start avahi-daemon.socket sudo systemctl start avahi-daemon.service sleep 2 +log "Change stafd config: tron=true, persistent-connections=false, zeroconf=enable" cat > "${stafd_conf_fname}" <<'EOF' [Global] tron=true persistent-connections=false [Service Discovery] -zeroconf=disabled +zeroconf=enabled EOF reload_cfg "@STAFD_PROCNAME@" sleep 1 +log "Change stafd config: ip-family=ipv4, kato=10, adding multiple controllers" cat > "${stafd_conf_fname}" <<'EOF' [Global] tron=true @@ -172,11 +197,15 @@ controller=transport=tcp;traddr=abracadabra controller= controller=trsvcid controller=transport=rdma;traddr=!@#$ +controller=transport=fc;traddr=21:00:00:00:00:00:00:00;host-traddr=20:00:00:00:00:00:00:00 +controller=transport=XM;traddr=2.2.2.2 blacklist=transport=tcp;traddr=1.1.1.1 blacklist=transport=tcp;traddr=1000.1000.1000.1000 EOF reload_cfg "@STAFD_PROCNAME@" + +log "Change stacd config: tron=true, udev-rule=disabled, sticky-connections=disabled" cat > "${stacd_conf_fname}" <<'EOF' [Global] tron=true @@ -186,12 +215,12 @@ EOF reload_cfg "@STACD_PROCNAME@" sleep 3 -printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_CTLNAME@ status" "\0033" +log "Invoking @STAFD_CTLNAME@ status" coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ status >/dev/null 2>&1 ################################################################################ # Fake mDNS packets from a CDC -printf 
"\n%b[1;36m%s%b[0m\n" "\0033" "Start Avahi publisher" "\0033" +log "Start Avahi publisher" AVAHI_PUBLISHER=mdns_publisher.service sudo systemctl stop ${AVAHI_PUBLISHER} >/dev/null 2>&1 sudo systemctl reset-failed ${AVAHI_PUBLISHER} >/dev/null 2>&1 @@ -200,7 +229,7 @@ sleep 1 ################################################################################ # Start nvme target simulator -printf "\n%b[1;36m%s%b[0m\n" "\0033" "Start nvmet" "\0033" +log "Start nvmet" sudo ../utils/nvmet/nvmet.py clean sudo ../utils/nvmet/nvmet.py create -f ../utils/nvmet/nvmet.conf sleep 2 @@ -210,76 +239,76 @@ reload_cfg "@STACD_PROCNAME@" sleep 3 ################################################################################ -printf "\n%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_PROCNAME@ --version" "\0033" +log "Invoking @STAFD_PROCNAME@ --version" coverage run --rcfile=.coveragerc @STAFD_PROCNAME@ --version -printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_PROCNAME@ --idl" "\0033" +log "Invoking @STAFD_PROCNAME@ --idl" coverage run --rcfile=.coveragerc @STAFD_PROCNAME@ --idl /tmp/@STAFD_PROCNAME@.idl -printf "\n%b[1;36m%s%b[0m\n" "\0033" "Invoking @STACD_PROCNAME@ --version" "\0033" +log "Invoking @STACD_PROCNAME@ --version" coverage run --rcfile=.coveragerc @STACD_PROCNAME@ --version -printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STACD_PROCNAME@ --idl" "\0033" +log "Invoking @STACD_PROCNAME@ --idl" coverage run --rcfile=.coveragerc @STACD_PROCNAME@ --idl /tmp/@STACD_PROCNAME@.idl ################################################################################ # Stimulate D-Bus activity -printf "\n%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_CTLNAME@ --version" "\0033" +log "Invoking @STAFD_CTLNAME@ --version" sudo coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ --version -printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_CTLNAME@ with a bad command" "\0033" +log "Invoking @STAFD_CTLNAME@ with a bad command" sudo coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ blah -printf 
"%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_CTLNAME@ troff" "\0033" +log "Invoking @STAFD_CTLNAME@ troff" sudo coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ troff -printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_CTLNAME@ status" "\0033" +log "Invoking @STAFD_CTLNAME@ status" coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ status >/dev/null 2>&1 -printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_CTLNAME@ tron" "\0033" +log "Invoking @STAFD_CTLNAME@ tron" sudo coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ tron -printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_CTLNAME@ ls" "\0033" +log "Invoking @STAFD_CTLNAME@ ls" coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ ls -d >/dev/null 2>&1 -printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_CTLNAME@ adlp" "\0033" +log "Invoking @STAFD_CTLNAME@ adlp" coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ adlp -d >/dev/null 2>&1 -printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_CTLNAME@ dlp" "\0033" +log "Invoking @STAFD_CTLNAME@ dlp" coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ dlp -t tcp -a ::1 -s 8009 >/dev/null 2>&1 -printf "\n%b[1;36m%s%b[0m\n" "\0033" "Invoking @STACD_CTLNAME@ --version" "\0033" +log "Invoking @STACD_CTLNAME@ --version" sudo coverage run --rcfile=.coveragerc @STACD_CTLNAME@ --version -printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STACD_CTLNAME@ with a bad command" "\0033" +log "Invoking @STACD_CTLNAME@ with a bad command" sudo coverage run --rcfile=.coveragerc @STACD_CTLNAME@ blah -printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STACD_CTLNAME@ troff" "\0033" +log "Invoking @STACD_CTLNAME@ troff" sudo coverage run --rcfile=.coveragerc @STACD_CTLNAME@ troff -printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STACD_CTLNAME@ status" "\0033" +log "Invoking @STACD_CTLNAME@ status" coverage run --rcfile=.coveragerc @STACD_CTLNAME@ status >/dev/null 2>&1 -printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STACD_CTLNAME@ tron" "\0033" +log "Invoking @STACD_CTLNAME@ tron" sudo coverage run 
--rcfile=.coveragerc @STACD_CTLNAME@ tron -printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STACD_CTLNAME@ ls" "\0033" +log "Invoking @STACD_CTLNAME@ ls" coverage run --rcfile=.coveragerc @STACD_CTLNAME@ ls -d >/dev/null 2>&1 ################################################################################ # Stimulate AENs activity by removing/restoring namespaces -printf "\n%b[1;36m%s%b[0m\n" "\0033" "Remove namespace: klingons" "\0033" +log "Remove namespace: klingons" sudo ../utils/nvmet/nvmet.py unlink -p 1 -s klingons sleep 2 -printf "\n%b[1;36m%s%b[0m\n" "\0033" "Invoking @STACD_CTLNAME@ ls" "\0033" +log "Invoking @STACD_CTLNAME@ ls" coverage run --rcfile=.coveragerc @STACD_CTLNAME@ ls -d >/dev/null 2>&1 -printf "\n%b[1;36m%s%b[0m\n" "\0033" "Restore namespace: klingons" "\0033" +log "Restore namespace: klingons" sudo ../utils/nvmet/nvmet.py link -p 1 -s klingons sleep 2 -printf "\n%b[1;36m%s%b[0m\n" "\0033" "Invoking @STACD_CTLNAME@ ls" "\0033" +log "Invoking @STACD_CTLNAME@ ls" coverage run --rcfile=.coveragerc @STACD_CTLNAME@ ls -d >/dev/null 2>&1 ################################################################################ # Stop Avahi Publisher -printf "\n%b[1;36m%s%b[0m\n" "\0033" "Stop Avahi publisher" "\0033" +log "Stop Avahi publisher" sudo systemctl stop ${AVAHI_PUBLISHER} sleep 1 ################################################################################ -printf "\n%b[1;36m%s%b[0m\n" "\0033" "Restart Avahi publisher" "\0033" +log "Restart Avahi publisher" sudo systemd-run --unit=${AVAHI_PUBLISHER} --working-directory=. 
avahi-publish -s SFSS _nvme-disc._tcp 8009 "p=tcp" sleep 2 ################################################################################ # Make config changes for @STAFD_PROCNAME@ -printf "\n%b[1;36m%s%b[0m\n" "\0033" "Empty configuration and disable zeroconf for @STAFD_PROCNAME@" "\0033" +log "Empty configuration and disable zeroconf for @STAFD_PROCNAME@" cat > "${stafd_conf_fname}" <<'EOF' [Global] tron=true @@ -293,7 +322,7 @@ sleep 1 ################################################################################ # Make more config changes for @STAFD_PROCNAME@ -printf "\n%b[1;36m%s%b[0m\n" "\0033" "Add single controller (::1) and re-enable zeroconf for @STAFD_PROCNAME@" "\0033" +log "Add single controller (::1) and re-enable zeroconf for @STAFD_PROCNAME@" cat > "${stafd_conf_fname}" <<'EOF' [Global] tron=true @@ -307,24 +336,23 @@ sleep 2 ################################################################################ # Stop Avahi Publisher -printf "\n%b[1;36m%s%b[0m\n" "\0033" "Stop Avahi publisher" "\0033" +log "Stop Avahi publisher" sudo systemctl stop ${AVAHI_PUBLISHER} sleep 2 ################################################################################ # Remove one of the NVMe device's -printf "\n%b[1;36m%s%b[0m\n" "\0033" "Remove (disconnect) nvme1" "\0033" +log "Remove (disconnect) nvme1" sudo nvme disconnect -d nvme1 sleep 2 ################################################################################ -printf "%b[1;36m%s%b[0m\n" "\0033" "Restart @STAFD_PROCNAME@ and @STACD_PROCNAME@" "\0033" sd_restart "@STAFD_PROCNAME@" sd_restart "@STACD_PROCNAME@" sleep 1 -printf "%b[1;36m%s%b[0m\n" "\0033" "Create invalid conditions for saving/loading @STAFD_PROCNAME@'s last known config" "\0033" +log "Create invalid conditions for saving/loading @STAFD_PROCNAME@'s last known config" rm -rf "/tmp/@STAFD_PROCNAME@" sd_stop "@STAFD_PROCNAME@" sd_restart "@STACD_PROCNAME@" @@ -334,7 +362,7 @@ sleep 2 
################################################################################ # Stop everything and collect coverage stats -printf "\n%b[1;36m%s%b[0m\n" "\0033" "Stop @STAFD_PROCNAME@ and @STACD_PROCNAME@" "\0033" +log "Stop @STAFD_PROCNAME@ and @STACD_PROCNAME@" sd_stop "@STAFD_PROCNAME@" sd_stop "@STACD_PROCNAME@" sleep 1 @@ -345,33 +373,49 @@ sudo chown -R "${PRIMARY_USR}":"${PRIMARY_GRP}" coverage >/dev/null 2>&1 sudo chown -R "${PRIMARY_USR}":"${PRIMARY_GRP}" staslib/__pycache__ >/dev/null 2>&1 sudo chown -R "${PRIMARY_USR}":"${PRIMARY_GRP}" subprojects/libnvme/libnvme/__pycache__ >/dev/null 2>&1 +log "nvme disconnect-all" sudo nvme disconnect-all +log "Remove ${stafd_conf_fname} and ${stacd_conf_fname}" rm "${stafd_conf_fname}" rm "${stacd_conf_fname}" +log "Run unit test: test-udev" PYTHONPATH=${PYTHON_PATH} coverage run --rcfile=.coveragerc ../test/test-udev.py +log "Run unit test: test-avahi" PYTHONPATH=${PYTHON_PATH} coverage run --rcfile=.coveragerc ../test/test-avahi.py +log "Run unit test: test-gtimer" PYTHONPATH=${PYTHON_PATH} coverage run --rcfile=.coveragerc ../test/test-gtimer.py +log "Run unit test: test-version" PYTHONPATH=${PYTHON_PATH} coverage run --rcfile=.coveragerc ../test/test-version.py +log "Run unit test: test-transport_id" PYTHONPATH=${PYTHON_PATH} coverage run --rcfile=.coveragerc ../test/test-transport_id.py +log "Run unit test: test-config" PYTHONPATH=${PYTHON_PATH} coverage run --rcfile=.coveragerc ../test/test-config.py +log "Run unit test: test-controller" PYTHONPATH=${PYTHON_PATH} coverage run --rcfile=.coveragerc ../test/test-controller.py +log "Run unit test: test-service" PYTHONPATH=${PYTHON_PATH} coverage run --rcfile=.coveragerc ../test/test-service.py +log "Run unit test: test-log" PYTHONPATH=${PYTHON_PATH} coverage run --rcfile=.coveragerc ../test/test-log.py +log "Run unit test: test-nvme_options" sudo PYTHONPATH=${PYTHON_PATH} coverage run --rcfile=.coveragerc ../test/test-nvme_options.py 
################################################################################ # Stop nvme target simulator -printf "\n%b[1;36m%s%b[0m\n" "\0033" "Stop nvmet" "\0033" +log "Stop nvmet" sudo ../utils/nvmet/nvmet.py clean -printf "\n%b[1;36m%s%b[0m\n" "\0033" "Collect all coverage data" "\0033" +log "Collect all coverage data" coverage combine --rcfile=.coveragerc -printf "\n%b[1;36m%s%b[0m\n" "\0033" "Generating coverage report" "\0033" +log "Generating coverage report" coverage report -i --rcfile=.coveragerc -printf "\n%b[1;36m%s%b[0m\n" "\0033" "Generating coverage report (HTML)" "\0033" +log "Generating coverage report (HTML)" coverage html -i --rcfile=.coveragerc + +log "All done!!!" + +log "FINISHED-FINISHED-FINISHED-FINISHED-FINISHED-FINISHED-FINISHED-FINISHED" diff --git a/doc/man/stacd.conf.xml b/doc/man/stacd.conf.xml index 60622f6..65ee71a 100644 --- a/doc/man/stacd.conf.xml +++ b/doc/man/stacd.conf.xml @@ -378,7 +378,7 @@ entries in stacd.conf have been removed. - With <code>sticky-connections=disabled</code> (default) + With <code>sticky-connections=disabled</code> stacd immediately disconnects from a previously connected IOC if the response to a @@ -411,7 +411,7 @@ - With <code>sticky-connections=enabled</code> + With <code>sticky-connections=enabled</code> (default) stacd does not disconnect from IOCs when a DPLE is removed or a controller= diff --git a/etc/stas/stacd.conf b/etc/stas/stacd.conf index 02e7b3e..0434671 100644 --- a/etc/stas/stacd.conf +++ b/etc/stas/stacd.conf @@ -202,8 +202,8 @@ # # Type: String # Range: [disabled, enabled] -# Default: disabled -#sticky-connections=disabled +# Default: enabled +#sticky-connections=enabled [Controllers] # controller: I/O Controllers (IOC) are specified with this keyword. 
diff --git a/stacd.py b/stacd.py index 708e372..28cefac 100755 --- a/stacd.py +++ b/stacd.py @@ -10,14 +10,12 @@ ''' STorage Appliance Connector Daemon ''' import sys -import logging from argparse import ArgumentParser from staslib import defs -# pylint: disable=consider-using-f-string -DBUS_IDL = ''' +DBUS_IDL = f''' - + @@ -34,19 +32,16 @@ DBUS_IDL = ''' - + - + -''' % ( - defs.STACD_DBUS_NAME, - defs.STACD_DBUS_NAME, -) - +''' +# ****************************************************************************** def parse_args(conf_file: str): # pylint: disable=missing-function-docstring parser = ArgumentParser( description=f'{defs.STAC_DESCRIPTION} ({defs.STAC_ACRONYM}). Must be root to run this program.' @@ -77,6 +72,12 @@ ARGS = parse_args(defs.STACD_CONFIG_FILE) if ARGS.version: print(f'{defs.PROJECT_NAME} {defs.VERSION}') + try: + import libnvme + + print(f'libnvme {libnvme.__version__}') + except (AttributeError, ModuleNotFoundError): + pass sys.exit(0) if ARGS.idl: @@ -85,78 +86,14 @@ if ARGS.idl: sys.exit(0) -# There is a reason for having this import here and not at the top of the file. -# We want to allow running stafd with the --version and --idl options and exit -# without having to import stas. -from staslib import stas # pylint: disable=wrong-import-position - -# Before going any further, make sure the script is allowed to run. -stas.check_if_allowed_to_continue() - - -################################################################################ -# Preliminary checks have passed. Let her rip! 
-# pylint: disable=wrong-import-position -# pylint: disable=wrong-import-order -import json -import pathlib -import systemd.daemon -import dasbus.error -import dasbus.client.observer -import dasbus.client.proxy -from gi.repository import GLib -from staslib import conf, log, gutil, trid, udev, ctrl, service # pylint: disable=ungrouped-imports - -log.init(ARGS.syslog) - -UDEV_RULE_SUPPRESS = pathlib.Path('/run/udev/rules.d', '70-nvmf-autoconnect.rules') - - -def udev_rule_ctrl(enable): - '''@brief We add an empty udev rule to /run/udev/rules.d to suppress - nvme-cli's udev rule that is used to tell udevd to automatically - connect to I/O controller. This is to avoid race conditions between - stacd and udevd. This is configurable. See "udev-rule" in stacd.conf - for details. - ''' - if enable: - try: - UDEV_RULE_SUPPRESS.unlink() - except FileNotFoundError: - pass - else: - if not UDEV_RULE_SUPPRESS.exists(): - pathlib.Path('/run/udev/rules.d').mkdir(parents=True, exist_ok=True) - UDEV_RULE_SUPPRESS.symlink_to('/dev/null') - - # ****************************************************************************** -class Ioc(ctrl.Controller): - '''@brief This object establishes a connection to one I/O Controller.''' - - def __init__(self, root, host, tid: trid.TID): - super().__init__(root, host, tid) - - def _on_udev_remove(self, udev_obj): - '''Called when the associated nvme device (/dev/nvmeX) is removed - from the system. - ''' - super()._on_udev_remove(udev_obj) - - # Defer removal of this object to the next main loop's idle period. 
- GLib.idle_add(STAC.remove_controller, self) - - def _find_existing_connection(self): - return self._udev.find_nvme_ioc_device(self.tid) - - -# ****************************************************************************** -class Stac(service.Service): - '''STorage Appliance Connector (STAC)''' +if __name__ == '__main__': + import json + import logging + from staslib import log, service, stas, udev # pylint: disable=ungrouped-imports - CONF_STABILITY_SOAK_TIME_SEC = 1.5 - CONF_STABILITY_LONG_SOAK_TIME_SEC = 10 # pylint: disable=invalid-name - ADD_EVENT_SOAK_TIME_SEC = 1 + # Before going any further, make sure the script is allowed to run. + stas.check_if_allowed_to_continue() class Dbus: '''This is the DBus interface that external programs can use to @@ -205,229 +142,8 @@ class Stac(service.Service): for controller in STAC.get_controllers() ] - # ========================================================================== - def __init__(self, args): - super().__init__(args, self._reload_hdlr) - - # We don't want to apply configuration changes to nvme-cli right away. - # Often, multiple changes will occur in a short amount of time (sub-second). - # We want to wait until there are no more changes before applying them - # to the system. The following timer acts as a "soak period". Changes - # will be applied by calling self._on_config_ctrls() at the end of - # the soak period. - self._cfg_soak_tmr = gutil.GTimer(Stac.CONF_STABILITY_SOAK_TIME_SEC, self._on_config_ctrls) - self._cfg_soak_tmr.start() - - self._add_event_soak_tmr = gutil.GTimer(Stac.ADD_EVENT_SOAK_TIME_SEC, self._on_add_event_soaked) - - self._config_connections_audit() - - # Create the D-Bus instance. 
- self._config_dbus(Stac.Dbus(), defs.STACD_DBUS_NAME, defs.STACD_DBUS_PATH) - - # Connect to STAF D-Bus interface - self._staf = None - self._staf_watcher = dasbus.client.observer.DBusObserver(self._sysbus, defs.STAFD_DBUS_NAME) - self._staf_watcher.service_available.connect(self._connect_to_staf) - self._staf_watcher.service_unavailable.connect(self._disconnect_from_staf) - self._staf_watcher.connect_once_available() - - # Suppress udev rule to auto-connect when AEN is received. - udev_rule_ctrl(conf.SvcConf().udev_rule_enabled) - - def _release_resources(self): - logging.debug('Stac._release_resources()') - - if self._add_event_soak_tmr: - self._add_event_soak_tmr.kill() - - udev_rule_ctrl(True) - - if self._udev: - self._udev.unregister_for_action_events('add') - - self._destroy_staf_comlink(self._staf_watcher) - if self._staf_watcher is not None: - self._staf_watcher.disconnect() - - super()._release_resources() - - self._staf = None - self._staf_watcher = None - self._add_event_soak_tmr = None - - def _audit_connections(self, tids): - '''A host should only connect to I/O controllers that have been zoned - for that host or a manual "controller" entry exists in stcd.conf. - A host should disconnect from an I/O controller when that I/O controller - is removed from the zone or a manual "controller" entry is removed from - stacd.conf. stacd will audit connections if "sticky-connections=disabled". - stacd will delete any connection that is not supposed to exist. 
- ''' - logging.debug('Stac._audit_connections() - tids = %s', tids) - num_controllers = len(self._controllers) - for tid in tids: - if tid not in self._controllers: - self._controllers[tid] = Ioc(self._root, self._host, tid) - - if num_controllers != len(self._controllers): - self._cfg_soak_tmr.start(Stac.CONF_STABILITY_SOAK_TIME_SEC) - - def _on_add_event(self, udev_obj): # pylint: disable=unused-argument - '''@brief This function is called when a "add" event is received from - the kernel for an NVMe device. This is used to trigger an audit and make - sure that the connection to an I/O controller is allowed. - - WARNING: There is a race condition with the "add" event from the kernel. - The kernel sends the "add" event a bit early and the sysfs attributes - associated with the nvme object are not always fully initialized. - To workaround this problem we use a soaking timer to give time for the - sysfs attributes to stabilize. - ''' - self._add_event_soak_tmr.start() - - def _on_add_event_soaked(self): - '''@brief After the add event has been soaking for ADD_EVENT_SOAK_TIME_SEC - seconds, we can audit the connections. - ''' - if not conf.SvcConf().sticky_connections: - self._audit_connections(self._udev.get_nvme_ioc_tids()) - return GLib.SOURCE_REMOVE - - def _config_connections_audit(self): - '''This function checks the "sticky_connections" parameter to determine - whether audits should be performed. Audits are enabled when - "sticky_connections" is disabled. - ''' - if not conf.SvcConf().sticky_connections: - if self._udev.get_registered_action_cback('add') is None: - self._udev.register_for_action_events('add', self._on_add_event) - self._audit_connections(self._udev.get_nvme_ioc_tids()) - else: - self._udev.unregister_for_action_events('add') - - def _keep_connections_on_exit(self): - '''@brief Determine whether connections should remain when the - process exits. - ''' - return True - - def _reload_hdlr(self): - '''@brief Reload configuration file. 
This is triggered by the SIGHUP - signal, which can be sent with "systemctl reload stacd". - ''' - systemd.daemon.notify('RELOADING=1') - service_cnf = conf.SvcConf() - service_cnf.reload() - self.tron = service_cnf.tron - self._config_connections_audit() - self._cfg_soak_tmr.start(Stac.CONF_STABILITY_SOAK_TIME_SEC) - udev_rule_ctrl(service_cnf.udev_rule_enabled) - systemd.daemon.notify('READY=1') - return GLib.SOURCE_CONTINUE - - def _get_log_pages_from_stafd(self): - if self._staf: - try: - return json.loads(self._staf.get_all_log_pages(True)) - except dasbus.error.DBusError: - pass - - return list() - - def _config_ctrls_finish(self, configured_ctrl_list): - configured_ctrl_list = [ - ctrl_dict for ctrl_dict in configured_ctrl_list if 'traddr' in ctrl_dict and 'subsysnqn' in ctrl_dict - ] - logging.debug('Stac._config_ctrls_finish() - configured_ctrl_list = %s', configured_ctrl_list) - - discovered_ctrl_list = list() - for staf_data in self._get_log_pages_from_stafd(): - host_traddr = staf_data['discovery-controller']['host-traddr'] - host_iface = staf_data['discovery-controller']['host-iface'] - for dlpe in staf_data['log-pages']: - if dlpe.get('subtype') == 'nvme': # eliminate discovery controllers - discovered_ctrl_list.append(stas.cid_from_dlpe(dlpe, host_traddr, host_iface)) - - logging.debug('Stac._config_ctrls_finish() - discovered_ctrl_list = %s', discovered_ctrl_list) - - controllers = stas.remove_blacklisted(configured_ctrl_list + discovered_ctrl_list) - controllers = stas.remove_invalid_addresses(controllers) - - new_controller_ids = {trid.TID(controller) for controller in controllers} - cur_controller_ids = set(self._controllers.keys()) - controllers_to_add = new_controller_ids - cur_controller_ids - controllers_to_del = cur_controller_ids - new_controller_ids - - logging.debug('Stac._config_ctrls_finish() - controllers_to_add = %s', list(controllers_to_add)) - logging.debug('Stac._config_ctrls_finish() - controllers_to_del = %s', 
list(controllers_to_del)) - - for tid in controllers_to_del: - controller = self._controllers.pop(tid, None) - if controller is not None: - controller.disconnect(self.remove_controller, conf.SvcConf().sticky_connections) - - for tid in controllers_to_add: - self._controllers[tid] = Ioc(self._root, self._host, tid) - - def _connect_to_staf(self, _): - '''@brief Hook up DBus signal handlers for signals from stafd.''' - try: - self._staf = self._sysbus.get_proxy(defs.STAFD_DBUS_NAME, defs.STAFD_DBUS_PATH) - self._staf.log_pages_changed.connect(self._log_pages_changed) - self._cfg_soak_tmr.start() - - # Make sure timer is set back to its normal value. - self._cfg_soak_tmr.set_timeout(Stac.CONF_STABILITY_SOAK_TIME_SEC) - logging.debug('Stac._connect_to_staf() - Connected to staf') - except dasbus.error.DBusError: - logging.error('Failed to connect to staf') - - def _destroy_staf_comlink(self, watcher): # pylint: disable=unused-argument - if self._staf: - self._staf.log_pages_changed.disconnect(self._log_pages_changed) - dasbus.client.proxy.disconnect_proxy(self._staf) - self._staf = None - - def _disconnect_from_staf(self, watcher): - self._destroy_staf_comlink(watcher) - - # When we lose connectivity with stafd, the most logical explanation - # is that stafd restarted. In that case, it may take some time for stafd - # to re-populate its log pages cache. So let's give stafd plenty of time - # to update its log pages cache and send log pages change notifications - # before triggering a stacd re-config. We do this by momentarily - # increasing the config soak timer to a longer period. 
- if self._cfg_soak_tmr: - self._cfg_soak_tmr.set_timeout(Stac.CONF_STABILITY_LONG_SOAK_TIME_SEC) - - logging.debug('Stac._disconnect_from_staf() - Disconnected from staf') - - def _log_pages_changed( # pylint: disable=too-many-arguments - self, transport, traddr, trsvcid, host_traddr, host_iface, subsysnqn, device - ): - logging.debug( - 'Stac._log_pages_changed() - transport=%s, traddr=%s, trsvcid=%s, host_traddr=%s, host_iface=%s, subsysnqn=%s, device=%s', - transport, - traddr, - trsvcid, - host_traddr, - host_iface, - subsysnqn, - device, - ) - self._cfg_soak_tmr.start(Stac.CONF_STABILITY_SOAK_TIME_SEC) - - def _load_last_known_config(self): - return dict() - - def _dump_last_known_config(self, controllers): - pass - - -# ****************************************************************************** -if __name__ == '__main__': - STAC = Stac(ARGS) + log.init(ARGS.syslog) + STAC = service.Stac(ARGS, Dbus()) STAC.run() STAC = None diff --git a/stafd.py b/stafd.py index aff64fd..8a77c51 100755 --- a/stafd.py +++ b/stafd.py @@ -10,14 +10,12 @@ ''' STorage Appliance Finder Daemon ''' import sys -import logging from argparse import ArgumentParser from staslib import defs -# pylint: disable=consider-using-f-string -DBUS_IDL = ''' +DBUS_IDL = f''' - + @@ -34,10 +32,10 @@ DBUS_IDL = ''' - + - + @@ -46,7 +44,7 @@ DBUS_IDL = ''' - + @@ -63,12 +61,10 @@ DBUS_IDL = ''' -''' % ( - defs.STAFD_DBUS_NAME, - defs.STAFD_DBUS_NAME, -) +''' +# ****************************************************************************** def parse_args(conf_file: str): # pylint: disable=missing-function-docstring parser = ArgumentParser( description=f'{defs.STAF_DESCRIPTION} ({defs.STAF_ACRONYM}). Must be root to run this program.' 
@@ -99,6 +95,12 @@ ARGS = parse_args(defs.STAFD_CONFIG_FILE) if ARGS.version: print(f'{defs.PROJECT_NAME} {defs.VERSION}') + try: + import libnvme + + print(f'libnvme {libnvme.__version__}') + except (AttributeError, ModuleNotFoundError): + pass sys.exit(0) if ARGS.idl: @@ -107,250 +109,15 @@ if ARGS.idl: sys.exit(0) -# There is a reason for having this import here and not at the top of the file. -# We want to allow running stafd with the --version and --idl options and exit -# without having to import stas and avahi. -from staslib import stas, avahi # pylint: disable=wrong-import-position - -# Before going any further, make sure the script is allowed to run. -stas.check_if_allowed_to_continue() - - -################################################################################ -# Preliminary checks have passed. Let her rip! -# pylint: disable=wrong-import-position -# pylint: disable=wrong-import-order -import json -import pickle -import dasbus.server.interface -import systemd.daemon -from libnvme import nvme -from gi.repository import GLib -from staslib import conf, log, gutil, trid, udev, ctrl, service # pylint: disable=ungrouped-imports - -log.init(ARGS.syslog) - -DLP_CHANGED = ( - (nvme.NVME_LOG_LID_DISCOVER << 16) | (nvme.NVME_AER_NOTICE_DISC_CHANGED << 8) | nvme.NVME_AER_NOTICE -) # 0x70f002 - - # ****************************************************************************** -class Dc(ctrl.Controller): - '''@brief This object establishes a connection to one Discover Controller (DC). - It retrieves the discovery log pages and caches them. - It also monitors udev events associated with that DC and updates - the cached discovery log pages accordingly. 
- ''' - - GET_LOG_PAGE_RETRY_RERIOD_SEC = 20 - REGISTRATION_RETRY_RERIOD_SEC = 10 - - def __init__(self, root, host, tid: trid.TID, log_pages=None): - super().__init__(root, host, tid, discovery_ctrl=True) - self._register_op = None - self._get_log_op = None - self._log_pages = log_pages if log_pages else list() # Log pages cache - - def _release_resources(self): - logging.debug('Dc._release_resources() - %s | %s', self.id, self.device) - super()._release_resources() - self._log_pages = list() - - def _kill_ops(self): - super()._kill_ops() - if self._get_log_op: - self._get_log_op.kill() - self._get_log_op = None - if self._register_op: - self._register_op.kill() - self._register_op = None - - def info(self) -> dict: - '''@brief Get the controller info for this object''' - info = super().info() - if self._get_log_op: - info['get log page operation'] = self._get_log_op.as_dict() - if self._register_op: - info['register operation'] = self._register_op.as_dict() - return info - - def cancel(self): - '''@brief Used to cancel pending operations.''' - super().cancel() - if self._get_log_op: - self._get_log_op.cancel() - if self._register_op: - self._register_op.cancel() - - def log_pages(self) -> list: - '''@brief Get the cached log pages for this object''' - return self._log_pages - - def referrals(self) -> list: - '''@brief Return the list of referrals''' - return [page for page in self._log_pages if page['subtype'] == 'referral'] - - def _on_aen(self, aen: int): - super()._on_aen(aen) - if aen == DLP_CHANGED and self._get_log_op: - self._get_log_op.run_async() - - def _on_nvme_event(self, nvme_event: str): - super()._on_nvme_event(nvme_event) - if nvme_event == 'connected' and self._register_op: - self._register_op.run_async() - - def _on_udev_remove(self, udev_obj): - super()._on_udev_remove(udev_obj) - if self._try_to_connect_deferred: - self._try_to_connect_deferred.schedule() - - def _find_existing_connection(self): - return 
self._udev.find_nvme_dc_device(self.tid) - - # -------------------------------------------------------------------------- - def _on_connect_success(self, op_obj, data): - '''@brief Function called when we successfully connect to the - Discovery Controller. - ''' - super()._on_connect_success(op_obj, data) - - if self._alive(): - if self._ctrl.is_registration_supported(): - self._register_op = gutil.AsyncOperationWithRetry( - self._on_registration_success, - self._on_registration_fail, - self._ctrl.registration_ctlr, - nvme.NVMF_DIM_TAS_REGISTER, - ) - self._register_op.run_async() - else: - self._get_log_op = gutil.AsyncOperationWithRetry( - self._on_get_log_success, self._on_get_log_fail, self._ctrl.discover - ) - self._get_log_op.run_async() - - # -------------------------------------------------------------------------- - def _on_registration_success(self, op_obj, data): # pylint: disable=unused-argument - '''@brief Function called when we successfully register with the - Discovery Controller. See self._register_op object - for details. - ''' - if self._alive(): - if data is not None: - logging.warning('%s | %s - Registration error. %s.', self.id, self.device, data) - else: - logging.debug('Dc._on_registration_success() - %s | %s', self.id, self.device) - self._get_log_op = gutil.AsyncOperationWithRetry( - self._on_get_log_success, self._on_get_log_fail, self._ctrl.discover - ) - self._get_log_op.run_async() - else: - logging.debug( - 'Dc._on_registration_success() - %s | %s Received event on dead object.', self.id, self.device - ) - - def _on_registration_fail(self, op_obj, err, fail_cnt): - '''@brief Function called when we fail to register with the - Discovery Controller. See self._register_op object - for details. - ''' - if self._alive(): - logging.debug( - 'Dc._on_registration_fail() - %s | %s: %s. Retry in %s sec', - self.id, - self.device, - err, - Dc.REGISTRATION_RETRY_RERIOD_SEC, - ) - if fail_cnt == 1: # Throttle the logs. 
Only print the first time we fail to connect - logging.error('%s | %s - Failed to register with Discovery Controller. %s', self.id, self.device, err) - # op_obj.retry(Dc.REGISTRATION_RETRY_RERIOD_SEC) - else: - logging.debug( - 'Dc._on_registration_fail() - %s | %s Received event on dead object. %s', - self.id, - self.device, - err, - ) - op_obj.kill() - - # -------------------------------------------------------------------------- - def _on_get_log_success(self, op_obj, data): # pylint: disable=unused-argument - '''@brief Function called when we successfully retrieve the log pages - from the Discovery Controller. See self._get_log_op object - for details. - ''' - if self._alive(): - # Note that for historical reasons too long to explain, the CDC may - # return invalid addresses ("0.0.0.0", "::", or ""). Those need to be - # filtered out. - referrals_before = self.referrals() - self._log_pages = ( - [ - {k: str(v) for k, v in dictionary.items()} - for dictionary in data - if dictionary.get('traddr') not in ('0.0.0.0', '::', '') - ] - if data - else list() - ) - logging.info( - '%s | %s - Received discovery log pages (num records=%s).', self.id, self.device, len(self._log_pages) - ) - referrals_after = self.referrals() - STAF.log_pages_changed(self, self.device) - if referrals_after != referrals_before: - logging.debug( - 'Dc._on_get_log_success() - %s | %s Referrals before = %s', - self.id, - self.device, - referrals_before, - ) - logging.debug( - 'Dc._on_get_log_success() - %s | %s Referrals after = %s', - self.id, - self.device, - referrals_after, - ) - STAF.referrals_changed() - else: - logging.debug( - 'Dc._on_get_log_success() - %s | %s Received event on dead object.', self.id, self.device - ) - - def _on_get_log_fail(self, op_obj, err, fail_cnt): - '''@brief Function called when we fail to retrieve the log pages - from the Discovery Controller. See self._get_log_op object - for details. 
- ''' - if self._alive(): - logging.debug( - 'Dc._on_get_log_fail() - %s | %s: %s. Retry in %s sec', - self.id, - self.device, - err, - Dc.GET_LOG_PAGE_RETRY_RERIOD_SEC, - ) - if fail_cnt == 1: # Throttle the logs. Only print the first time we fail to connect - logging.error('%s | %s - Failed to retrieve log pages. %s', self.id, self.device, err) - op_obj.retry(Dc.GET_LOG_PAGE_RETRY_RERIOD_SEC) - else: - logging.debug( - 'Dc._on_get_log_fail() - %s | %s Received event on dead object. %s', - self.id, - self.device, - err, - ) - op_obj.kill() - - -# ****************************************************************************** -class Staf(service.Service): - '''STorage Appliance Finder (STAF)''' +if __name__ == '__main__': + import json + import logging + import dasbus.server.interface + from staslib import log, service, stas, udev # pylint: disable=ungrouped-imports - CONF_STABILITY_SOAK_TIME_SEC = 1.5 + # Before going any further, make sure the script is allowed to run. + stas.check_if_allowed_to_continue() class Dbus: '''This is the DBus interface that external programs can use to @@ -431,148 +198,8 @@ class Staf(service.Service): for controller in STAF.get_controllers() ] - # ========================================================================== - def __init__(self, args): - super().__init__(args, self._reload_hdlr) - - self._avahi = avahi.Avahi(self._sysbus, self._avahi_change) - self._avahi.config_stypes(conf.SvcConf().get_stypes()) - - # We don't want to apply configuration changes to nvme-cli right away. - # Often, multiple changes will occur in a short amount of time (sub-second). - # We want to wait until there are no more changes before applying them - # to the system. The following timer acts as a "soak period". Changes - # will be applied by calling self._on_config_ctrls() at the end of - # the soak period. 
- self._cfg_soak_tmr = gutil.GTimer(Staf.CONF_STABILITY_SOAK_TIME_SEC, self._on_config_ctrls) - self._cfg_soak_tmr.start() - - # Create the D-Bus instance. - self._config_dbus(Staf.Dbus(), defs.STAFD_DBUS_NAME, defs.STAFD_DBUS_PATH) - - def info(self) -> dict: - '''@brief Get the status info for this object (used for debug)''' - info = super().info() - info['avahi'] = self._avahi.info() - return info - - def _release_resources(self): - logging.debug('Staf._release_resources()') - super()._release_resources() - if self._avahi: - self._avahi.kill() - self._avahi = None - - def _load_last_known_config(self): - try: - with open(self._lkc_file, 'rb') as file: - config = pickle.load(file) - except (FileNotFoundError, AttributeError): - return dict() - - logging.debug('Staf._load_last_known_config() - DC count = %s', len(config)) - return {tid: Dc(self._root, self._host, tid, log_pages) for tid, log_pages in config.items()} - - def _dump_last_known_config(self, controllers): - try: - with open(self._lkc_file, 'wb') as file: - config = {tid: dc.log_pages() for tid, dc in controllers.items()} - logging.debug('Staf._dump_last_known_config() - DC count = %s', len(config)) - pickle.dump(config, file) - except FileNotFoundError as ex: - logging.error('Unable to save last known config: %s', ex) - - def _keep_connections_on_exit(self): - '''@brief Determine whether connections should remain when the - process exits. - ''' - return conf.SvcConf().persistent_connections - - def _reload_hdlr(self): - '''@brief Reload configuration file. This is triggered by the SIGHUP - signal, which can be sent with "systemctl reload stafd". 
- ''' - systemd.daemon.notify('RELOADING=1') - service_cnf = conf.SvcConf() - service_cnf.reload() - self.tron = service_cnf.tron - self._avahi.kick_start() # Make sure Avahi is running - self._avahi.config_stypes(service_cnf.get_stypes()) - self._cfg_soak_tmr.start() - systemd.daemon.notify('READY=1') - return GLib.SOURCE_CONTINUE - - def log_pages_changed(self, controller, device): - '''@brief Function invoked when a controller's cached log pages - have changed. This will emit a D-Bus signal to inform - other applications that the cached log pages have changed. - ''' - self._dbus_iface.log_pages_changed.emit( - controller.tid.transport, - controller.tid.traddr, - controller.tid.trsvcid, - controller.tid.host_traddr, - controller.tid.host_iface, - controller.tid.subsysnqn, - device, - ) - - def referrals_changed(self): - '''@brief Function invoked when a controller's cached referrals - have changed. - ''' - logging.debug('Staf.referrals_changed()') - self._cfg_soak_tmr.start() - - def _referrals(self) -> list: - return [ - stas.cid_from_dlpe(dlpe, controller.tid.host_traddr, controller.tid.host_iface) - for controller in self.get_controllers() - for dlpe in controller.referrals() - ] - - def _config_ctrls_finish(self, configured_ctrl_list): - '''@brief Finish discovery controllers configuration after - hostnames (if any) have been resolved. 
- ''' - configured_ctrl_list = [ - ctrl_dict - for ctrl_dict in configured_ctrl_list - if 'traddr' in ctrl_dict and ctrl_dict.setdefault('subsysnqn', defs.WELL_KNOWN_DISC_NQN) - ] - - discovered_ctrl_list = self._avahi.get_controllers() - referral_ctrl_list = self._referrals() - logging.debug('Staf._config_ctrls_finish() - configured_ctrl_list = %s', configured_ctrl_list) - logging.debug('Staf._config_ctrls_finish() - discovered_ctrl_list = %s', discovered_ctrl_list) - logging.debug('Staf._config_ctrls_finish() - referral_ctrl_list = %s', referral_ctrl_list) - - controllers = stas.remove_blacklisted(configured_ctrl_list + discovered_ctrl_list + referral_ctrl_list) - controllers = stas.remove_invalid_addresses(controllers) - - new_controller_ids = {trid.TID(controller) for controller in controllers} - cur_controller_ids = set(self._controllers.keys()) - controllers_to_add = new_controller_ids - cur_controller_ids - controllers_to_del = cur_controller_ids - new_controller_ids - - logging.debug('Staf._config_ctrls_finish() - controllers_to_add = %s', list(controllers_to_add)) - logging.debug('Staf._config_ctrls_finish() - controllers_to_del = %s', list(controllers_to_del)) - - for tid in controllers_to_del: - controller = self._controllers.pop(tid, None) - if controller is not None: - controller.disconnect(self.remove_controller, conf.SvcConf().persistent_connections) - - for tid in controllers_to_add: - self._controllers[tid] = Dc(self._root, self._host, tid) - - def _avahi_change(self): - self._cfg_soak_tmr.start() - - -# ****************************************************************************** -if __name__ == '__main__': - STAF = Staf(ARGS) + log.init(ARGS.syslog) + STAF = service.Staf(ARGS, Dbus()) STAF.run() STAF = None diff --git a/staslib/avahi.py b/staslib/avahi.py index 768bbf4..90a67c8 100644 --- a/staslib/avahi.py +++ b/staslib/avahi.py @@ -172,9 +172,7 @@ class Avahi: # pylint: disable=too-many-instance-attributes services = dict() for service, obj in 
self._services.items(): interface, protocol, name, stype, domain = service - key = '({}, {}, {}.{}, {})'.format( # pylint: disable=consider-using-f-string - socket.if_indextoname(interface), Avahi.protos.get(protocol, 'unknown'), name, domain, stype - ) + key = f'({socket.if_indextoname(interface)}, {Avahi.protos.get(protocol, "unknown")}, {name}.{domain}, {stype})' services[key] = obj.get('data', {}) info = { @@ -316,7 +314,7 @@ class Avahi: # pylint: disable=too-many-instance-attributes _interface_name: str, _signal_name: str, args: typing.Tuple[int, int, str, str, str, int], - *_user_data + *_user_data, ): (interface, protocol, name, stype, domain, flags) = args logging.debug( @@ -352,7 +350,7 @@ class Avahi: # pylint: disable=too-many-instance-attributes _interface_name: str, _signal_name: str, args: typing.Tuple[int, int, str, str, str, int], - *_user_data + *_user_data, ): (interface, protocol, name, stype, domain, flags) = args logging.debug( @@ -386,7 +384,7 @@ class Avahi: # pylint: disable=too-many-instance-attributes _interface_name: str, _signal_name: str, args: typing.Tuple[int, int, str, str, str, str, int, str, int, list, int], - *_user_data + *_user_data, ): (interface, protocol, name, stype, domain, host, aprotocol, address, port, txt, flags) = args txt = _txt2dict(txt) @@ -428,7 +426,7 @@ class Avahi: # pylint: disable=too-many-instance-attributes interface_name: str, _signal_name: str, args: typing.Tuple[str], - *_user_data + *_user_data, ): (error,) = args if 'ServiceResolver' not in interface_name or 'TimeoutError' not in error: diff --git a/staslib/conf.py b/staslib/conf.py index 3f52e4f..c314a9e 100644 --- a/staslib/conf.py +++ b/staslib/conf.py @@ -74,7 +74,7 @@ class SvcConf(metaclass=singleton.Singleton): ('Global', 'ignore-iface'): 'false', ('Global', 'ip-family'): 'ipv4+ipv6', ('Global', 'udev-rule'): 'enabled', - ('Global', 'sticky-connections'): 'disabled', + ('Global', 'sticky-connections'): 'enabled', ('Service Discovery', 
'zeroconf'): 'enabled', ('Controllers', 'controller'): list(), ('Controllers', 'blacklist'): list(), diff --git a/staslib/ctrl.py b/staslib/ctrl.py index 5504baa..dbc1973 100644 --- a/staslib/ctrl.py +++ b/staslib/ctrl.py @@ -10,69 +10,76 @@ Dc (Discovery Controller) and Ioc (I/O Controller) objects are derived.''' import logging -from gi.repository import Gio, GLib +from gi.repository import GLib from libnvme import nvme -from staslib import conf, gutil, trid, udev +from staslib import conf, gutil, trid, udev, stas DC_KATO_DEFAULT = 30 # seconds # ****************************************************************************** -class Controller: # pylint: disable=too-many-instance-attributes +class Controller(stas.ControllerABC): '''@brief Base class used to manage the connection to a controller.''' - CONNECT_RETRY_PERIOD_SEC = 60 - FAST_CONNECT_RETRY_PERIOD_SEC = 3 - def __init__(self, root, host, tid: trid.TID, discovery_ctrl=False): - self._root = root - self._host = host - self._udev = udev.UDEV - self._tid = tid - self._cancellable = Gio.Cancellable() - self._connect_op = None - self._connect_attempts = 0 - self._retry_connect_tmr = gutil.GTimer(Controller.CONNECT_RETRY_PERIOD_SEC, self._on_try_to_connect) - self._device = None - self._ctrl = None - self._discovery_ctrl = discovery_ctrl - self._try_to_connect_deferred = gutil.Deferred(self._try_to_connect) - self._try_to_connect_deferred.schedule() + self._udev = udev.UDEV + self._device = None # Refers to the nvme device (e.g. 
/dev/nvme[n]) + self._ctrl = None # libnvme's nvme.ctrl object + self._connect_op = None + + super().__init__(root, host, tid, discovery_ctrl) def _release_resources(self): logging.debug('Controller._release_resources() - %s', self.id) - # Remove pending deferred from main loop - if self._try_to_connect_deferred: - self._try_to_connect_deferred.cancel() - self._try_to_connect_deferred = None - if self._udev: self._udev.unregister_for_device_events(self._on_udev_notification) - if self._retry_connect_tmr is not None: - self._retry_connect_tmr.kill() - - if self._cancellable and not self._cancellable.is_cancelled(): - self._cancellable.cancel() - self._kill_ops() - self._tid = None + super()._release_resources() + self._ctrl = None - self._device = None - self._retry_connect_tmr = None - self._cancellable = None self._udev = None - def _alive(self): - '''There may be race condition where a queued event gets processed - after the object is no longer configured (i.e. alive). This method - can be used by callback functions to make sure the object is still - alive before processing further. - ''' - return self._cancellable and not self._cancellable.is_cancelled() + @property + def device(self) -> str: + '''@brief return the Linux nvme device id (e.g. nvme3) or empty + string if no device is associated with this controller''' + if not self._device and self._ctrl and self._ctrl.name: + self._device = self._ctrl.name + + return self._device or 'nvme?' 
+ + def controller_id_dict(self) -> dict: + '''@brief return the controller ID as a dict.''' + cid = super().controller_id_dict() + cid['device'] = self.device + return cid + + def details(self) -> dict: + '''@brief return detailed debug info about this controller''' + details = super().details() + details.update( + self._udev.get_attributes(self.device, + ('hostid', 'hostnqn', 'model', + 'serial', 'dctype', 'cntrltype')) + ) + return details + + def info(self) -> dict: + '''@brief Get the controller info for this object''' + info = super().info() + if self._connect_op: + info['connect operation'] = self._connect_op.as_dict() + return info + + def cancel(self): + '''@brief Used to cancel pending operations.''' + super().cancel() + if self._connect_op: + self._connect_op.cancel() def _kill_ops(self): if self._connect_op: @@ -91,7 +98,7 @@ class Controller: # pylint: disable=too-many-instance-attributes self._on_nvme_event(nvme_event) elif udev_obj.action == 'remove': logging.info('%s | %s - Received "remove" event', self.id, udev_obj.sys_name) - self._on_udev_remove(udev_obj) + self._on_ctrl_removed(udev_obj) else: logging.debug( 'Controller._on_udev_notification() - %s | %s - Received "%s" notification.', @@ -108,33 +115,12 @@ class Controller: # pylint: disable=too-many-instance-attributes udev_obj.sys_name, ) - def _on_aen(self, aen: int): - pass - - def _on_nvme_event(self, nvme_event): - pass - - def _on_udev_remove(self, udev_obj): # pylint: disable=unused-argument + def _on_ctrl_removed(self, obj): # pylint: disable=unused-argument self._udev.unregister_for_device_events(self._on_udev_notification) self._kill_ops() # Kill all pending operations self._ctrl = None - def _find_existing_connection(self): - raise NotImplementedError() - - def _on_try_to_connect(self): - self._try_to_connect_deferred.schedule() - return GLib.SOURCE_REMOVE - - def _try_to_connect(self): - # This is a deferred function call. Make sure - # the source of the deferred is still good. 
- source = GLib.main_current_source() - if source and source.is_destroyed(): - return - - self._connect_attempts += 1 - + def _do_connect(self): host_iface = ( self.tid.host_iface if (self.tid.host_iface and not conf.SvcConf().ignore_iface and conf.NvmeOptions().host_iface_supp) @@ -164,7 +150,6 @@ class Controller: # pylint: disable=too-many-instance-attributes self._on_connect_success, self._on_connect_fail, self._ctrl.init, self._host, int(udev_obj.sys_number) ) else: - self._device = None service_conf = conf.SvcConf() cfg = { 'hdr_digest': service_conf.hdr_digest, 'data_digest': service_conf.data_digest } @@ -198,11 +183,10 @@ class Controller: # pylint: disable=too-many-instance-attributes self._connect_op = None if self._alive(): - if not self._device: - self._device = self._ctrl.name + self._device = self._ctrl.name logging.info('%s | %s - Connection established!', self.id, self.device) self._connect_attempts = 0 - self._udev.register_for_device_events(self.device, self._on_udev_notification) + self._udev.register_for_device_events(self._device, self._on_udev_notification) else: logging.debug( 'Controller._on_connect_success() - %s | %s Received event on dead object. data=%s', @@ -227,11 +211,11 @@ class Controller: # pylint: disable=too-many-instance-attributes # the same time. This is perfectly fine, except that we may get a bogus # failed to connect error. By doing a fast re-try, stacd can quickly # verify that the connection was actually successful. - self._retry_connect_tmr.set_timeout(Controller.FAST_CONNECT_RETRY_PERIOD_SEC) + self._retry_connect_tmr.set_timeout(self.FAST_CONNECT_RETRY_PERIOD_SEC) elif self._connect_attempts == 2: # If the fast connect re-try fails, then we can print a message to # indicate the failure, and start a slow re-try period. - self._retry_connect_tmr.set_timeout(Controller.CONNECT_RETRY_PERIOD_SEC) + self._retry_connect_tmr.set_timeout(self.CONNECT_RETRY_PERIOD_SEC) logging.error('%s Failed to connect to controller. 
%s', self.id, getattr(err, 'message', err)) logging.debug( @@ -248,53 +232,6 @@ class Controller: # pylint: disable=too-many-instance-attributes getattr(err, 'message', err), ) - @property - def id(self) -> str: # pylint: disable=missing-function-docstring - return str(self.tid) - - @property - def tid(self): # pylint: disable=missing-function-docstring - return self._tid - - @property - def device(self) -> str: # pylint: disable=missing-function-docstring - return self._device if self._device else '' - - def controller_id_dict(self) -> dict: - '''@brief return the controller ID as a dict.''' - cid = self.tid.as_dict() - cid['device'] = self.device - return cid - - def details(self) -> dict: - '''@brief return detailed debug info about this controller''' - details = self.controller_id_dict() - details.update(self._udev.get_attributes(self.device, ('hostid', 'hostnqn', 'model', 'serial'))) - details['connect attempts'] = str(self._connect_attempts) - details['retry connect timer'] = str(self._retry_connect_tmr) - return details - - def info(self) -> dict: - '''@brief Get the controller info for this object''' - info = self.details() - if self._connect_op: - info['connect operation'] = self._connect_op.as_dict() - return info - - def cancel(self): - '''@brief Used to cancel pending operations.''' - if self._cancellable and not self._cancellable.is_cancelled(): - logging.debug('Controller.cancel() - %s', self.id) - self._cancellable.cancel() - - if self._connect_op: - self._connect_op.cancel() - - def kill(self): - '''@brief Used to release all resources associated with this object.''' - logging.debug('Controller.kill() - %s', self.id) - self._release_resources() - def disconnect(self, disconnected_cb, keep_connection): '''@brief Issue an asynchronous disconnect command to a Controller. 
Once the async command has completed, the callback 'disconnected_cb' @@ -313,7 +250,7 @@ class Controller: # pylint: disable=too-many-instance-attributes # cannot be called directly as the current Controller object is in the # process of being disconnected and the callback will in fact delete # the object. This would invariably lead to unpredictable outcome. - GLib.idle_add(disconnected_cb, self) + GLib.idle_add(disconnected_cb, self, True) def _on_disconn_success(self, op_obj, data, disconnected_cb): # pylint: disable=unused-argument logging.debug('Controller._on_disconn_success() - %s | %s', self.id, self.device) @@ -322,7 +259,7 @@ class Controller: # pylint: disable=too-many-instance-attributes # cannot be called directly as the current Controller object is in the # process of being disconnected and the callback will in fact delete # the object. This would invariably lead to unpredictable outcome. - GLib.idle_add(disconnected_cb, self) + GLib.idle_add(disconnected_cb, self, True) def _on_disconn_fail(self, op_obj, err, fail_cnt, disconnected_cb): # pylint: disable=unused-argument logging.debug('Controller._on_disconn_fail() - %s | %s: %s', self.id, self.device, err) @@ -331,4 +268,249 @@ class Controller: # pylint: disable=too-many-instance-attributes # cannot be called directly as the current Controller object is in the # process of being disconnected and the callback will in fact delete # the object. This would invariably lead to unpredictable outcome. - GLib.idle_add(disconnected_cb, self) + GLib.idle_add(disconnected_cb, self, False) + + +# ****************************************************************************** +class Dc(Controller): + '''@brief This object establishes a connection to one Discover Controller (DC). + It retrieves the discovery log pages and caches them. + It also monitors udev events associated with that DC and updates + the cached discovery log pages accordingly. 
+ ''' + + DLP_CHANGED = ( + (nvme.NVME_LOG_LID_DISCOVER << 16) | (nvme.NVME_AER_NOTICE_DISC_CHANGED << 8) | nvme.NVME_AER_NOTICE + ) # 0x70f002 + GET_LOG_PAGE_RETRY_RERIOD_SEC = 20 + REGISTRATION_RETRY_RERIOD_SEC = 10 + + def __init__(self, staf, root, host, tid: trid.TID, log_pages=None): # pylint: disable=too-many-arguments + super().__init__(root, host, tid, discovery_ctrl=True) + self._staf = staf + self._register_op = None + self._get_log_op = None + self._log_pages = log_pages if log_pages else list() # Log pages cache + + def _release_resources(self): + logging.debug('Dc._release_resources() - %s | %s', self.id, self.device) + super()._release_resources() + self._log_pages = list() + self._staf = None + + def _kill_ops(self): + super()._kill_ops() + if self._get_log_op: + self._get_log_op.kill() + self._get_log_op = None + if self._register_op: + self._register_op.kill() + self._register_op = None + + def info(self) -> dict: + '''@brief Get the controller info for this object''' + info = super().info() + if self._get_log_op: + info['get log page operation'] = self._get_log_op.as_dict() + if self._register_op: + info['register operation'] = self._register_op.as_dict() + return info + + def cancel(self): + '''@brief Used to cancel pending operations.''' + super().cancel() + if self._get_log_op: + self._get_log_op.cancel() + if self._register_op: + self._register_op.cancel() + + def log_pages(self) -> list: + '''@brief Get the cached log pages for this object''' + return self._log_pages + + def referrals(self) -> list: + '''@brief Return the list of referrals''' + return [page for page in self._log_pages if page['subtype'] == 'referral'] + + def _on_aen(self, aen: int): + if aen == self.DLP_CHANGED and self._get_log_op: + self._get_log_op.run_async() + + def _on_nvme_event(self, nvme_event: str): + if nvme_event == 'connected' and self._register_op: + self._register_op.run_async() + + def _on_ctrl_removed(self, obj): + super()._on_ctrl_removed(obj) + if 
self._try_to_connect_deferred: + self._try_to_connect_deferred.schedule() + + def _find_existing_connection(self): + return self._udev.find_nvme_dc_device(self.tid) + + # -------------------------------------------------------------------------- + def _on_connect_success(self, op_obj, data): + '''@brief Function called when we successfully connect to the + Discovery Controller. + ''' + super()._on_connect_success(op_obj, data) + + if self._alive(): + if self._ctrl.is_registration_supported(): + self._register_op = gutil.AsyncOperationWithRetry( + self._on_registration_success, + self._on_registration_fail, + self._ctrl.registration_ctlr, + nvme.NVMF_DIM_TAS_REGISTER, + ) + self._register_op.run_async() + else: + self._get_log_op = gutil.AsyncOperationWithRetry( + self._on_get_log_success, self._on_get_log_fail, self._ctrl.discover + ) + self._get_log_op.run_async() + + # -------------------------------------------------------------------------- + def _on_registration_success(self, op_obj, data): # pylint: disable=unused-argument + '''@brief Function called when we successfully register with the + Discovery Controller. See self._register_op object + for details. + ''' + if self._alive(): + if data is not None: + logging.warning('%s | %s - Registration error. %s.', self.id, self.device, data) + else: + logging.debug('Dc._on_registration_success() - %s | %s', self.id, self.device) + self._get_log_op = gutil.AsyncOperationWithRetry( + self._on_get_log_success, self._on_get_log_fail, self._ctrl.discover + ) + self._get_log_op.run_async() + else: + logging.debug( + 'Dc._on_registration_success() - %s | %s Received event on dead object.', self.id, self.device + ) + + def _on_registration_fail(self, op_obj, err, fail_cnt): + '''@brief Function called when we fail to register with the + Discovery Controller. See self._register_op object + for details. + ''' + if self._alive(): + logging.debug( + 'Dc._on_registration_fail() - %s | %s: %s. 
Retry in %s sec', + self.id, + self.device, + err, + Dc.REGISTRATION_RETRY_RERIOD_SEC, + ) + if fail_cnt == 1: # Throttle the logs. Only print the first time we fail to connect + logging.error('%s | %s - Failed to register with Discovery Controller. %s', self.id, self.device, err) + # op_obj.retry(Dc.REGISTRATION_RETRY_RERIOD_SEC) + else: + logging.debug( + 'Dc._on_registration_fail() - %s | %s Received event on dead object. %s', + self.id, + self.device, + err, + ) + op_obj.kill() + + # -------------------------------------------------------------------------- + def _on_get_log_success(self, op_obj, data): # pylint: disable=unused-argument + '''@brief Function called when we successfully retrieve the log pages + from the Discovery Controller. See self._get_log_op object + for details. + ''' + if self._alive(): + # Note that for historical reasons too long to explain, the CDC may + # return invalid addresses ("0.0.0.0", "::", or ""). Those need to be + # filtered out. + referrals_before = self.referrals() + self._log_pages = ( + [ + {k: str(v) for k, v in dictionary.items()} + for dictionary in data + if dictionary.get('traddr') not in ('0.0.0.0', '::', '') + ] + if data + else list() + ) + logging.info( + '%s | %s - Received discovery log pages (num records=%s).', self.id, self.device, len(self._log_pages) + ) + referrals_after = self.referrals() + self._staf.log_pages_changed(self, self.device) + if referrals_after != referrals_before: + logging.debug( + 'Dc._on_get_log_success() - %s | %s Referrals before = %s', + self.id, + self.device, + referrals_before, + ) + logging.debug( + 'Dc._on_get_log_success() - %s | %s Referrals after = %s', + self.id, + self.device, + referrals_after, + ) + self._staf.referrals_changed() + else: + logging.debug( + 'Dc._on_get_log_success() - %s | %s Received event on dead object.', self.id, self.device + ) + + def _on_get_log_fail(self, op_obj, err, fail_cnt): + '''@brief Function called when we fail to retrieve the log pages + 
from the Discovery Controller. See self._get_log_op object + for details. + ''' + if self._alive(): + logging.debug( + 'Dc._on_get_log_fail() - %s | %s: %s. Retry in %s sec', + self.id, + self.device, + err, + Dc.GET_LOG_PAGE_RETRY_RERIOD_SEC, + ) + if fail_cnt == 1: # Throttle the logs. Only print the first time we fail to connect + logging.error('%s | %s - Failed to retrieve log pages. %s', self.id, self.device, err) + op_obj.retry(Dc.GET_LOG_PAGE_RETRY_RERIOD_SEC) + else: + logging.debug( + 'Dc._on_get_log_fail() - %s | %s Received event on dead object. %s', + self.id, + self.device, + err, + ) + op_obj.kill() + + +# ****************************************************************************** +class Ioc(Controller): + '''@brief This object establishes a connection to one I/O Controller.''' + + def __init__(self, stac, root, host, tid: trid.TID): + self._stac = stac + super().__init__(root, host, tid) + + def _release_resources(self): + super()._release_resources() + self._stac = None + + def _on_ctrl_removed(self, obj): + '''Called when the associated nvme device (/dev/nvmeX) is removed + from the system. + ''' + super()._on_ctrl_removed(obj) + + # Defer removal of this object to the next main loop's idle period. 
+ GLib.idle_add(self._stac.remove_controller, self, True) + + def _find_existing_connection(self): + return self._udev.find_nvme_ioc_device(self.tid) + + def _on_aen(self, aen: int): + pass + + def _on_nvme_event(self, nvme_event): + pass diff --git a/staslib/gutil.py b/staslib/gutil.py index b302f3a..36ce2c7 100644 --- a/staslib/gutil.py +++ b/staslib/gutil.py @@ -104,8 +104,7 @@ class GTimer: # ****************************************************************************** -class NameResolver: - # pylint: disable=too-few-public-methods +class NameResolver: # pylint: disable=too-few-public-methods '''@brief DNS resolver to convert host names to IP addresses.''' def __init__(self): @@ -133,8 +132,10 @@ class NameResolver: else: logging.error('Cannot resolve traddr: %s', hostname) - except GLib.GError: - logging.error('Cannot resolve traddr: %s', hostname) + except GLib.GError as err: + # We don't need to report "cancellation" errors. + if not err.matches(Gio.io_error_quark(), Gio.IOErrorEnum.CANCELLED): + logging.error('Cannot resolve traddr: %s. 
%s', hostname, err.message) # pylint: disable=no-member logging.debug('NameResolver.resolve_ctrl_async() - resolved \'%s\' -> %s', hostname, traddr) controllers[indx]['traddr'] = traddr diff --git a/staslib/log.py b/staslib/log.py index c624978..9622e98 100644 --- a/staslib/log.py +++ b/staslib/log.py @@ -24,7 +24,7 @@ def init(syslog: bool): if syslog: try: # Try journal logger first - import systemd.journal # pylint: disable=redefined-outer-name,import-outside-toplevel + import systemd.journal # pylint: disable=import-outside-toplevel handler = systemd.journal.JournalHandler(SYSLOG_IDENTIFIER=defs.PROG_NAME) except ModuleNotFoundError: @@ -32,9 +32,7 @@ def init(syslog: bool): from logging.handlers import SysLogHandler # pylint: disable=import-outside-toplevel handler = SysLogHandler(address="/dev/log") - handler.setFormatter( - logging.Formatter('{}: %(message)s'.format(defs.PROG_NAME)) # pylint: disable=consider-using-f-string - ) + handler.setFormatter(logging.Formatter(f'{defs.PROG_NAME}: %(message)s')) else: # Log to stdout handler = logging.StreamHandler(stream=sys.stdout) diff --git a/staslib/service.py b/staslib/service.py index 556a9f9..a48e66d 100644 --- a/staslib/service.py +++ b/staslib/service.py @@ -9,248 +9,416 @@ '''This module defines the base Service object from which the Staf and the Stac objects are derived.''' -import os -import signal +import json +import pickle import logging +import pathlib import systemd.daemon -import dasbus.connection +import dasbus.error +import dasbus.client.observer +import dasbus.client.proxy -from gi.repository import Gio, GLib +from gi.repository import GLib from libnvme import nvme -from staslib import conf, ctrl, defs, gutil, log, stas, trid, udev +from staslib import avahi, conf, ctrl, defs, gutil, stas, trid, udev # ****************************************************************************** -class Service: # pylint: disable=too-many-instance-attributes +class Service(stas.ServiceABC): '''@brief Base class 
used to manage a STorage Appliance Service''' def __init__(self, args, reload_hdlr): - sysconf = conf.SysConf() self._root = nvme.root() self._host = nvme.host(self._root, sysconf.hostnqn, sysconf.hostid, sysconf.hostsymname) - service_conf = conf.SvcConf() - service_conf.set_conf_file(args.conf_file) # reload configuration - self._tron = args.tron or service_conf.tron - log.set_level_from_tron(self._tron) - self._root.log_level("debug" if self._tron else "err") + super().__init__(args, reload_hdlr) - self._lkc_file = os.path.join(os.environ.get('RUNTIME_DIRECTORY', os.path.join('/run', defs.PROG_NAME)), 'last-known-config.pickle') - self._loop = GLib.MainLoop() - self._udev = udev.UDEV - self._cancellable = Gio.Cancellable() - self._resolver = gutil.NameResolver() - self._controllers = self._load_last_known_config() - self._dbus_iface = None - self._cfg_soak_tmr = None - self._sysbus = dasbus.connection.SystemMessageBus() - - GLib.unix_signal_add(GLib.PRIORITY_HIGH, signal.SIGINT, self._stop_hdlr) # CTRL-C - GLib.unix_signal_add(GLib.PRIORITY_HIGH, signal.SIGTERM, self._stop_hdlr) # systemctl stop stafd - GLib.unix_signal_add(GLib.PRIORITY_HIGH, signal.SIGHUP, reload_hdlr) # systemctl reload stafd - - nvme_options = conf.NvmeOptions() - if not nvme_options.host_iface_supp or not nvme_options.discovery_supp: - logging.warning( - 'Kernel does not appear to support all the options needed to run this program. Consider updating to a later kernel version.' 
- ) + self._root.log_level("debug" if self._tron else "err") def _release_resources(self): logging.debug('Service._release_resources()') + super()._release_resources() - if self._cancellable and not self._cancellable.is_cancelled(): - self._cancellable.cancel() + self._host = None + self._root = None - if self._cfg_soak_tmr is not None: - self._cfg_soak_tmr.kill() + @stas.ServiceABC.tron.setter + def tron(self, value): + '''@brief Set Trace ON property''' + super(__class__, self.__class__).tron.__set__(self, value) + self._root.log_level("debug" if self._tron else "err") - self._controllers.clear() - if self._sysbus: - self._sysbus.disconnect() +# ****************************************************************************** +def udev_rule_ctrl(enable): + '''@brief We add an empty udev rule to /run/udev/rules.d to suppress + nvme-cli's udev rule that is used to tell udevd to automatically + connect to I/O controller. This is to avoid race conditions between + stacd and udevd. This is configurable. See "udev-rule" in stacd.conf + for details. 
+ ''' + udev_rule_suppress = pathlib.Path('/run/udev/rules.d', '70-nvmf-autoconnect.rules') + if enable: + try: + udev_rule_suppress.unlink() + except FileNotFoundError: + pass + else: + if not udev_rule_suppress.exists(): + pathlib.Path('/run/udev/rules.d').mkdir(parents=True, exist_ok=True) + udev_rule_suppress.symlink_to('/dev/null') - self._cfg_soak_tmr = None - self._cancellable = None - self._resolver = None - self._lkc_file = None - self._sysbus = None - self._udev = None - def _config_dbus(self, iface_obj, bus_name: str, obj_name: str): - self._dbus_iface = iface_obj - self._sysbus.publish_object(obj_name, iface_obj) - self._sysbus.register_service(bus_name) +# ****************************************************************************** +class Stac(Service): + '''STorage Appliance Connector (STAC)''' - @property - def tron(self): - '''@brief Get Trace ON property''' - return self._tron + CONF_STABILITY_LONG_SOAK_TIME_SEC = 10 # pylint: disable=invalid-name + ADD_EVENT_SOAK_TIME_SEC = 1 - @tron.setter - def tron(self, value): # pylint: disable=no-self-use - '''@brief Set Trace ON property''' - self._tron = value - log.set_level_from_tron(self._tron) - self._root.log_level("debug" if self._tron else "err") + def __init__(self, args, dbus): + super().__init__(args, self._reload_hdlr) - def run(self): - '''@brief Start the main loop execution''' - try: - self._loop.run() - except Exception as ex: # pylint: disable=broad-except - logging.critical('exception: %s', ex) + self._udev = udev.UDEV - self._loop = None + self._add_event_soak_tmr = gutil.GTimer(self.ADD_EVENT_SOAK_TIME_SEC, self._on_add_event_soaked) - def info(self) -> dict: - '''@brief Get the status info for this object (used for debug)''' - nvme_options = conf.NvmeOptions() - return { - 'last known config file': self._lkc_file, - 'config soak timer': str(self._cfg_soak_tmr), - 'kernel support': { - 'TP8013': nvme_options.discovery_supp, - 'host_iface': nvme_options.host_iface_supp, - }, - 'system 
config': conf.SysConf().as_dict(), - } - - def get_controllers(self): - '''@brief return the list of controller objects''' - return self._controllers.values() - - def get_controller( - self, transport: str, traddr: str, trsvcid: str, host_traddr: str, host_iface: str, subsysnqn: str - ): # pylint: disable=too-many-arguments - '''@brief get the specified controller object from the list of controllers''' - cid = { - 'transport': transport, - 'traddr': traddr, - 'trsvcid': trsvcid, - 'host-traddr': host_traddr, - 'host-iface': host_iface, - 'subsysnqn': subsysnqn, - } - return self._controllers.get(trid.TID(cid)) - - def _remove_ctrl_from_dict(self, controller): - tid_to_pop = controller.tid - if not tid_to_pop: - # Being paranoid. This should not happen, but let's say the - # controller object has been purged, but it is somehow still - # listed in self._controllers. - for tid, _controller in self._controllers.items(): - if _controller is controller: - tid_to_pop = tid - break - - if tid_to_pop: - logging.debug('Service._remove_ctrl_from_dict() - %s | %s', tid_to_pop, controller.device) - self._controllers.pop(tid_to_pop, None) - else: - logging.debug('Service._remove_ctrl_from_dict() - already removed') + self._config_connections_audit() - def remove_controller(self, controller): - '''@brief remove the specified controller object from the list of controllers''' - logging.debug('Service.remove_controller()') - if isinstance(controller, ctrl.Controller): - self._remove_ctrl_from_dict(controller) + # Create the D-Bus instance. 
+ self._config_dbus(dbus, defs.STACD_DBUS_NAME, defs.STACD_DBUS_PATH) - controller.kill() + # Connect to STAF D-Bus interface + self._staf = None + self._staf_watcher = dasbus.client.observer.DBusObserver(self._sysbus, defs.STAFD_DBUS_NAME) + self._staf_watcher.service_available.connect(self._connect_to_staf) + self._staf_watcher.service_unavailable.connect(self._disconnect_from_staf) + self._staf_watcher.connect_once_available() - if self._cfg_soak_tmr: - self._cfg_soak_tmr.start() + # Suppress udev rule to auto-connect when AEN is received. + udev_rule_ctrl(conf.SvcConf().udev_rule_enabled) - def _cancel(self): - logging.debug('Service._cancel()') - if not self._cancellable.is_cancelled(): - self._cancellable.cancel() + def _release_resources(self): + logging.debug('Stac._release_resources()') + + if self._add_event_soak_tmr: + self._add_event_soak_tmr.kill() + + udev_rule_ctrl(True) + + if self._udev: + self._udev.unregister_for_action_events('add') + + self._destroy_staf_comlink(self._staf_watcher) + if self._staf_watcher is not None: + self._staf_watcher.disconnect() - for controller in self._controllers.values(): - controller.cancel() + super()._release_resources() + + self._udev = None + self._staf = None + self._staf_watcher = None + self._add_event_soak_tmr = None + + def _audit_connections(self, tids): + '''A host should only connect to I/O controllers that have been zoned + for that host or a manual "controller" entry exists in stacd.conf. + A host should disconnect from an I/O controller when that I/O controller + is removed from the zone or a manual "controller" entry is removed from + stacd.conf. stacd will audit connections if "sticky-connections=disabled". + stacd will delete any connection that is not supposed to exist. 
+ ''' + logging.debug('Stac._audit_connections() - tids = %s', tids) + num_controllers = len(self._controllers) + for tid in tids: + if tid not in self._controllers: + self._controllers[tid] = ctrl.Ioc(self, self._root, self._host, tid) + + if num_controllers != len(self._controllers): + self._cfg_soak_tmr.start(self.CONF_STABILITY_SOAK_TIME_SEC) + + def _on_add_event(self, udev_obj): # pylint: disable=unused-argument + '''@brief This function is called when an "add" event is received from + the kernel for an NVMe device. This is used to trigger an audit and make + sure that the connection to an I/O controller is allowed. + + WARNING: There is a race condition with the "add" event from the kernel. + The kernel sends the "add" event a bit early and the sysfs attributes + associated with the nvme object are not always fully initialized. + To work around this problem we use a soaking timer to give time for the + sysfs attributes to stabilize. + ''' + self._add_event_soak_tmr.start() + + def _on_add_event_soaked(self): + '''@brief After the add event has been soaking for ADD_EVENT_SOAK_TIME_SEC + seconds, we can audit the connections. + ''' + if not conf.SvcConf().sticky_connections: + self._audit_connections(self._udev.get_nvme_ioc_tids()) + return GLib.SOURCE_REMOVE + + def _config_connections_audit(self): + '''This function checks the "sticky_connections" parameter to determine + whether audits should be performed. Audits are enabled when + "sticky_connections" is disabled. + ''' + if not conf.SvcConf().sticky_connections: + if self._udev.get_registered_action_cback('add') is None: + self._udev.register_for_action_events('add', self._on_add_event) + self._audit_connections(self._udev.get_nvme_ioc_tids()) + else: + self._udev.unregister_for_action_events('add') def _keep_connections_on_exit(self): '''@brief Determine whether connections should remain when the process exits. - - NOTE) This is the base class method used to define the interface. 
- It must be overloaded by a child class. ''' - raise NotImplementedError() + return True - def _stop_hdlr(self): - systemd.daemon.notify('STOPPING=1') + def _reload_hdlr(self): + '''@brief Reload configuration file. This is triggered by the SIGHUP + signal, which can be sent with "systemctl reload stacd". + ''' + systemd.daemon.notify('RELOADING=1') + service_cnf = conf.SvcConf() + service_cnf.reload() + self.tron = service_cnf.tron + self._config_connections_audit() + self._cfg_soak_tmr.start(self.CONF_STABILITY_SOAK_TIME_SEC) + udev_rule_ctrl(service_cnf.udev_rule_enabled) + systemd.daemon.notify('READY=1') + return GLib.SOURCE_CONTINUE + + def _get_log_pages_from_stafd(self): + if self._staf: + try: + return json.loads(self._staf.get_all_log_pages(True)) + except dasbus.error.DBusError: + pass + + return list() - self._cancel() # Cancel pending operations + def _config_ctrls_finish(self, configured_ctrl_list): + configured_ctrl_list = [ + ctrl_dict for ctrl_dict in configured_ctrl_list if 'traddr' in ctrl_dict and 'subsysnqn' in ctrl_dict + ] + logging.debug('Stac._config_ctrls_finish() - configured_ctrl_list = %s', configured_ctrl_list) + + discovered_ctrl_list = list() + for staf_data in self._get_log_pages_from_stafd(): + host_traddr = staf_data['discovery-controller']['host-traddr'] + host_iface = staf_data['discovery-controller']['host-iface'] + for dlpe in staf_data['log-pages']: + if dlpe.get('subtype') == 'nvme': # eliminate discovery controllers + discovered_ctrl_list.append(stas.cid_from_dlpe(dlpe, host_traddr, host_iface)) + + logging.debug('Stac._config_ctrls_finish() - discovered_ctrl_list = %s', discovered_ctrl_list) + + controllers = stas.remove_blacklisted(configured_ctrl_list + discovered_ctrl_list) + controllers = stas.remove_invalid_addresses(controllers) + + new_controller_ids = {trid.TID(controller) for controller in controllers} + cur_controller_ids = set(self._controllers.keys()) + controllers_to_add = new_controller_ids - 
cur_controller_ids + controllers_to_del = cur_controller_ids - new_controller_ids + + logging.debug('Stac._config_ctrls_finish() - controllers_to_add = %s', list(controllers_to_add)) + logging.debug('Stac._config_ctrls_finish() - controllers_to_del = %s', list(controllers_to_del)) + + for tid in controllers_to_del: + controller = self._controllers.pop(tid, None) + if controller is not None: + controller.disconnect(self.remove_controller, conf.SvcConf().sticky_connections) + + for tid in controllers_to_add: + self._controllers[tid] = ctrl.Ioc(self, self._root, self._host, tid) + + def _connect_to_staf(self, _): + '''@brief Hook up DBus signal handlers for signals from stafd.''' + try: + self._staf = self._sysbus.get_proxy(defs.STAFD_DBUS_NAME, defs.STAFD_DBUS_PATH) + self._staf.log_pages_changed.connect(self._log_pages_changed) + self._cfg_soak_tmr.start() - self._dump_last_known_config(self._controllers) + # Make sure timer is set back to its normal value. + self._cfg_soak_tmr.set_timeout(self.CONF_STABILITY_SOAK_TIME_SEC) + logging.debug('Stac._connect_to_staf() - Connected to staf') + except dasbus.error.DBusError: + logging.error('Failed to connect to staf') + + def _destroy_staf_comlink(self, watcher): # pylint: disable=unused-argument + if self._staf: + self._staf.log_pages_changed.disconnect(self._log_pages_changed) + dasbus.client.proxy.disconnect_proxy(self._staf) + self._staf = None + + def _disconnect_from_staf(self, watcher): + self._destroy_staf_comlink(watcher) + + # When we lose connectivity with stafd, the most logical explanation + # is that stafd restarted. In that case, it may take some time for stafd + # to re-populate its log pages cache. So let's give stafd plenty of time + # to update its log pages cache and send log pages change notifications + # before triggering a stacd re-config. We do this by momentarily + # increasing the config soak timer to a longer period. 
+ if self._cfg_soak_tmr: + self._cfg_soak_tmr.set_timeout(self.CONF_STABILITY_LONG_SOAK_TIME_SEC) + + logging.debug('Stac._disconnect_from_staf() - Disconnected from staf') + + def _log_pages_changed( # pylint: disable=too-many-arguments + self, transport, traddr, trsvcid, host_traddr, host_iface, subsysnqn, device + ): + logging.debug( + 'Stac._log_pages_changed() - transport=%s, traddr=%s, trsvcid=%s, host_traddr=%s, host_iface=%s, subsysnqn=%s, device=%s', + transport, + traddr, + trsvcid, + host_traddr, + host_iface, + subsysnqn, + device, + ) + if self._cfg_soak_tmr: + self._cfg_soak_tmr.start(self.CONF_STABILITY_SOAK_TIME_SEC) - if len(self._controllers) == 0: - GLib.idle_add(self._exit) - else: - # Tell all controller objects to disconnect - keep_connections = self._keep_connections_on_exit() - controllers = self._controllers.values() - for controller in controllers: - controller.disconnect(self._on_final_disconnect, keep_connections) + def _load_last_known_config(self): + return dict() - return GLib.SOURCE_REMOVE + def _dump_last_known_config(self, controllers): + pass - def _on_final_disconnect(self, controller): - '''Callback invoked after a controller is disconnected. - THIS IS USED DURING PROCESS SHUTDOWN TO WAIT FOR ALL CONTROLLERS TO BE - DISCONNECTED BEFORE EXITING THE PROGRAM. ONLY CALL ON SHUTDOWN! - ''' - logging.debug('Service._on_final_disconnect()') - self._remove_ctrl_from_dict(controller) - controller.kill() +# ****************************************************************************** +class Staf(Service): + '''STorage Appliance Finder (STAF)''' - # When all controllers have disconnected, we can finish the clean up - if len(self._controllers) == 0: - # Defer exit to the next main loop's idle period. 
- GLib.idle_add(self._exit) + def __init__(self, args, dbus): + super().__init__(args, self._reload_hdlr) - def _exit(self): - logging.debug('Service._exit()') - self._release_resources() - self._loop.quit() + self._avahi = avahi.Avahi(self._sysbus, self._avahi_change) + self._avahi.config_stypes(conf.SvcConf().get_stypes()) - def _on_config_ctrls(self, *_user_data): - self._config_ctrls() - return GLib.SOURCE_REMOVE + # Create the D-Bus instance. + self._config_dbus(dbus, defs.STAFD_DBUS_NAME, defs.STAFD_DBUS_PATH) - def _config_ctrls(self): - '''@brief Start controllers configuration.''' - # The configuration file may contain controllers and/or blacklist - # elements with traddr specified as hostname instead of IP address. - # Because of this, we need to remove those blacklisted elements before - # running name resolution. And we will need to remove blacklisted - # elements after name resolution is complete (i.e. in the calback - # function _config_ctrls_finish) - logging.debug('Service._config_ctrls()') - configured_controllers = stas.remove_blacklisted(conf.SvcConf().get_controllers()) - self._resolver.resolve_ctrl_async(self._cancellable, configured_controllers, self._config_ctrls_finish) + def info(self) -> dict: + '''@brief Get the status info for this object (used for debug)''' + info = super().info() + info['avahi'] = self._avahi.info() + return info - def _config_ctrls_finish(self, configured_ctrl_list): - '''@brief Finish controllers configuration after hostnames (if any) - have been resolved. - - Configuring controllers must be done asynchronously in 2 steps. - In the first step, host names get resolved to find their IP addresses. - Name resolution can take a while, especially when an external name - resolution server is used. Once that step completed, the callback - method _config_ctrls_finish() (i.e. this method), gets invoked to - complete the controller configuration. - - NOTE) This is the base class method used to define the interface. 
- It must be overloaded by a child class. - ''' - raise NotImplementedError() + def _release_resources(self): + logging.debug('Staf._release_resources()') + super()._release_resources() + if self._avahi: + self._avahi.kill() + self._avahi = None def _load_last_known_config(self): - raise NotImplementedError() + try: + with open(self._lkc_file, 'rb') as file: + config = pickle.load(file) + except (FileNotFoundError, AttributeError): + return dict() + + logging.debug('Staf._load_last_known_config() - DC count = %s', len(config)) + return {tid: ctrl.Dc(self, self._root, self._host, tid, log_pages) for tid, log_pages in config.items()} def _dump_last_known_config(self, controllers): - raise NotImplementedError() + try: + with open(self._lkc_file, 'wb') as file: + config = {tid: dc.log_pages() for tid, dc in controllers.items()} + logging.debug('Staf._dump_last_known_config() - DC count = %s', len(config)) + pickle.dump(config, file) + except FileNotFoundError as ex: + logging.error('Unable to save last known config: %s', ex) + + def _keep_connections_on_exit(self): + '''@brief Determine whether connections should remain when the + process exits. + ''' + return conf.SvcConf().persistent_connections + + def _reload_hdlr(self): + '''@brief Reload configuration file. This is triggered by the SIGHUP + signal, which can be sent with "systemctl reload stafd". + ''' + systemd.daemon.notify('RELOADING=1') + service_cnf = conf.SvcConf() + service_cnf.reload() + self.tron = service_cnf.tron + self._avahi.kick_start() # Make sure Avahi is running + self._avahi.config_stypes(service_cnf.get_stypes()) + self._cfg_soak_tmr.start() + systemd.daemon.notify('READY=1') + return GLib.SOURCE_CONTINUE + + def log_pages_changed(self, controller, device): + '''@brief Function invoked when a controller's cached log pages + have changed. This will emit a D-Bus signal to inform + other applications that the cached log pages have changed. 
+ ''' + self._dbus_iface.log_pages_changed.emit( + controller.tid.transport, + controller.tid.traddr, + controller.tid.trsvcid, + controller.tid.host_traddr, + controller.tid.host_iface, + controller.tid.subsysnqn, + device, + ) + + def referrals_changed(self): + '''@brief Function invoked when a controller's cached referrals + have changed. + ''' + logging.debug('Staf.referrals_changed()') + self._cfg_soak_tmr.start() + + def _referrals(self) -> list: + return [ + stas.cid_from_dlpe(dlpe, controller.tid.host_traddr, controller.tid.host_iface) + for controller in self.get_controllers() + for dlpe in controller.referrals() + ] + + def _config_ctrls_finish(self, configured_ctrl_list): + '''@brief Finish discovery controllers configuration after + hostnames (if any) have been resolved. + ''' + configured_ctrl_list = [ + ctrl_dict + for ctrl_dict in configured_ctrl_list + if 'traddr' in ctrl_dict and ctrl_dict.setdefault('subsysnqn', defs.WELL_KNOWN_DISC_NQN) + ] + + discovered_ctrl_list = self._avahi.get_controllers() + referral_ctrl_list = self._referrals() + logging.debug('Staf._config_ctrls_finish() - configured_ctrl_list = %s', configured_ctrl_list) + logging.debug('Staf._config_ctrls_finish() - discovered_ctrl_list = %s', discovered_ctrl_list) + logging.debug('Staf._config_ctrls_finish() - referral_ctrl_list = %s', referral_ctrl_list) + + controllers = stas.remove_blacklisted(configured_ctrl_list + discovered_ctrl_list + referral_ctrl_list) + controllers = stas.remove_invalid_addresses(controllers) + + new_controller_ids = {trid.TID(controller) for controller in controllers} + cur_controller_ids = set(self._controllers.keys()) + controllers_to_add = new_controller_ids - cur_controller_ids + controllers_to_del = cur_controller_ids - new_controller_ids + + logging.debug('Staf._config_ctrls_finish() - controllers_to_add = %s', list(controllers_to_add)) + logging.debug('Staf._config_ctrls_finish() - controllers_to_del = %s', list(controllers_to_del)) + + for tid in 
controllers_to_del: + controller = self._controllers.pop(tid, None) + if controller is not None: + controller.disconnect(self.remove_controller, conf.SvcConf().persistent_connections) + + for tid in controllers_to_add: + self._controllers[tid] = ctrl.Dc(self, self._root, self._host, tid) + + def _avahi_change(self): + self._cfg_soak_tmr.start() diff --git a/staslib/stas.py b/staslib/stas.py index 7bf91e0..496f063 100644 --- a/staslib/stas.py +++ b/staslib/stas.py @@ -6,14 +6,19 @@ # # Authors: Martin Belanger # -'''Library for staf/stac''' +'''Library for staf/stac. You will find here common code for stafd and stacd +including the Abstract Base Classes (ABC) for Controllers and Services''' import os import sys -import ipaddress +import abc +import signal import logging - -from staslib import conf, defs, trid +import ipaddress +import systemd.daemon +import dasbus.connection +from gi.repository import Gio, GLib +from staslib import conf, defs, gutil, log, trid # ****************************************************************************** @@ -108,3 +113,379 @@ def remove_invalid_addresses(controllers: list): logging.warning('Invalid transport %s', transport) return valid_controllers + + +# ****************************************************************************** +class ControllerABC(abc.ABC): # pylint: disable=too-many-instance-attributes + '''@brief Base class used to manage the connection to a controller.''' + + CONNECT_RETRY_PERIOD_SEC = 60 + FAST_CONNECT_RETRY_PERIOD_SEC = 3 + + def __init__(self, root, host, tid: trid.TID, discovery_ctrl=False): + self._root = root + self._host = host + self._tid = tid + self._cancellable = Gio.Cancellable() + self._connect_attempts = 0 + self._retry_connect_tmr = gutil.GTimer(self.CONNECT_RETRY_PERIOD_SEC, self._on_try_to_connect) + self._discovery_ctrl = discovery_ctrl + self._try_to_connect_deferred = gutil.Deferred(self._try_to_connect) + self._try_to_connect_deferred.schedule() + + def _release_resources(self): + # 
Remove pending deferred from main loop + if self._try_to_connect_deferred: + self._try_to_connect_deferred.cancel() + + if self._retry_connect_tmr is not None: + self._retry_connect_tmr.kill() + + if self._cancellable and not self._cancellable.is_cancelled(): + self._cancellable.cancel() + + self._tid = None + self._cancellable = None + self._retry_connect_tmr = None + self._try_to_connect_deferred = None + + @property + def id(self) -> str: + '''@brief Return the Transport ID as a printable string''' + return str(self.tid) + + @property + def tid(self): + '''@brief Return the Transport ID object''' + return self._tid + + def controller_id_dict(self) -> dict: + '''@brief return the controller ID as a dict.''' + return self.tid.as_dict() + + def details(self) -> dict: + '''@brief return detailed debug info about this controller''' + details = self.controller_id_dict() + details['connect attempts'] = str(self._connect_attempts) + details['retry connect timer'] = str(self._retry_connect_tmr) + return details + + def info(self) -> dict: + '''@brief Get the controller info for this object''' + return self.details() + + def cancel(self): + '''@brief Used to cancel pending operations.''' + if self._cancellable and not self._cancellable.is_cancelled(): + logging.debug('ControllerABC.cancel() - %s', self.id) + self._cancellable.cancel() + + def kill(self): + '''@brief Used to release all resources associated with this object.''' + logging.debug('ControllerABC.kill() - %s', self.id) + self._release_resources() + + def _alive(self): + '''There may be race condition where a queued event gets processed + after the object is no longer configured (i.e. alive). This method + can be used by callback functions to make sure the object is still + alive before processing further. 
+ ''' + return self._cancellable and not self._cancellable.is_cancelled() + + def _on_try_to_connect(self): + self._try_to_connect_deferred.schedule() + return GLib.SOURCE_REMOVE + + def _try_to_connect(self): + # This is a deferred function call. Make sure + # the source of the deferred is still good. + source = GLib.main_current_source() + if source and source.is_destroyed(): + return + + self._connect_attempts += 1 + + self._do_connect() + + @abc.abstractmethod + def _do_connect(self): + raise NotImplementedError() + + @abc.abstractmethod + def _on_aen(self, aen: int): + raise NotImplementedError() + + @abc.abstractmethod + def _on_nvme_event(self, nvme_event): + raise NotImplementedError() + + @abc.abstractmethod + def _on_ctrl_removed(self, obj): + raise NotImplementedError() + + @abc.abstractmethod + def _find_existing_connection(self): + raise NotImplementedError() + + @abc.abstractmethod + def disconnect(self, disconnected_cb, keep_connection): + '''@brief Issue an asynchronous disconnect command to a Controller. + Once the async command has completed, the callback 'disconnected_cb' + will be invoked. If a controller is already disconnected, then the + callback will be added to the main loop's next idle slot to be executed + ASAP. 
+ ''' + raise NotImplementedError() + + +# ****************************************************************************** +class ServiceABC(abc.ABC): # pylint: disable=too-many-instance-attributes + '''@brief Base class used to manage a STorage Appliance Service''' + + CONF_STABILITY_SOAK_TIME_SEC = 1.5 + + def __init__(self, args, reload_hdlr): + + service_conf = conf.SvcConf() + service_conf.set_conf_file(args.conf_file) # reload configuration + self._tron = args.tron or service_conf.tron + log.set_level_from_tron(self._tron) + + self._lkc_file = os.path.join(os.environ.get('RUNTIME_DIRECTORY', os.path.join('/run', defs.PROG_NAME)), 'last-known-config.pickle') + self._loop = GLib.MainLoop() + self._cancellable = Gio.Cancellable() + self._resolver = gutil.NameResolver() + self._controllers = self._load_last_known_config() + self._dbus_iface = None + self._cfg_soak_tmr = gutil.GTimer(self.CONF_STABILITY_SOAK_TIME_SEC, self._on_config_ctrls) + self._sysbus = dasbus.connection.SystemMessageBus() + + GLib.unix_signal_add(GLib.PRIORITY_HIGH, signal.SIGINT, self._stop_hdlr) # CTRL-C + GLib.unix_signal_add(GLib.PRIORITY_HIGH, signal.SIGTERM, self._stop_hdlr) # systemctl stop stafd + GLib.unix_signal_add(GLib.PRIORITY_HIGH, signal.SIGHUP, reload_hdlr) # systemctl reload stafd + + nvme_options = conf.NvmeOptions() + if not nvme_options.host_iface_supp or not nvme_options.discovery_supp: + logging.warning( + 'Kernel does not appear to support all the options needed to run this program. Consider updating to a later kernel version.' + ) + + # We don't want to apply configuration changes to nvme-cli right away. + # Often, multiple changes will occur in a short amount of time (sub-second). + # We want to wait until there are no more changes before applying them + # to the system. The following timer acts as a "soak period". Changes + # will be applied by calling self._on_config_ctrls() at the end of + # the soak period. 
+ self._cfg_soak_tmr.start() + + def _release_resources(self): + logging.debug('ServiceABC._release_resources()') + + if self._cancellable and not self._cancellable.is_cancelled(): + self._cancellable.cancel() + + if self._cfg_soak_tmr is not None: + self._cfg_soak_tmr.kill() + + self._controllers.clear() + + if self._sysbus: + self._sysbus.disconnect() + + self._cfg_soak_tmr = None + self._cancellable = None + self._resolver = None + self._lkc_file = None + self._sysbus = None + + def _config_dbus(self, iface_obj, bus_name: str, obj_name: str): + self._dbus_iface = iface_obj + self._sysbus.publish_object(obj_name, iface_obj) + self._sysbus.register_service(bus_name) + + @property + def tron(self): + '''@brief Get Trace ON property''' + return self._tron + + @tron.setter + def tron(self, value): + '''@brief Set Trace ON property''' + self._tron = value + log.set_level_from_tron(self._tron) + + def run(self): + '''@brief Start the main loop execution''' + try: + self._loop.run() + except Exception as ex: # pylint: disable=broad-except + logging.critical('exception: %s', ex) + + self._loop = None + + def info(self) -> dict: + '''@brief Get the status info for this object (used for debug)''' + nvme_options = conf.NvmeOptions() + return { + 'last known config file': self._lkc_file, + 'config soak timer': str(self._cfg_soak_tmr), + 'kernel support': { + 'TP8013': nvme_options.discovery_supp, + 'host_iface': nvme_options.host_iface_supp, + }, + 'system config': conf.SysConf().as_dict(), + } + + def get_controllers(self) -> dict: + '''@brief return the list of controller objects''' + return self._controllers.values() + + def get_controller( + self, transport: str, traddr: str, trsvcid: str, host_traddr: str, host_iface: str, subsysnqn: str + ): # pylint: disable=too-many-arguments + '''@brief get the specified controller object from the list of controllers''' + cid = { + 'transport': transport, + 'traddr': traddr, + 'trsvcid': trsvcid, + 'host-traddr': host_traddr, + 
'host-iface': host_iface, + 'subsysnqn': subsysnqn, + } + return self._controllers.get(trid.TID(cid)) + + def _remove_ctrl_from_dict(self, controller): + tid_to_pop = controller.tid + if not tid_to_pop: + # Being paranoid. This should not happen, but let's say the + # controller object has been purged, but it is somehow still + # listed in self._controllers. + for tid, _controller in self._controllers.items(): + if _controller is controller: + tid_to_pop = tid + break + + if tid_to_pop: + logging.debug('ServiceABC._remove_ctrl_from_dict()- %s | %s', tid_to_pop, controller.device) + self._controllers.pop(tid_to_pop, None) + else: + logging.debug('ServiceABC._remove_ctrl_from_dict()- already removed') + + def remove_controller(self, controller, success): # pylint: disable=unused-argument + '''@brief remove the specified controller object from the list of controllers + @param controller: the controller object + @param success: whether the disconnect was successful''' + logging.debug('ServiceABC.remove_controller()') + if isinstance(controller, ControllerABC): + self._remove_ctrl_from_dict(controller) + + controller.kill() + + if self._cfg_soak_tmr: + self._cfg_soak_tmr.start() + + def _cancel(self): + logging.debug('ServiceABC._cancel()') + if not self._cancellable.is_cancelled(): + self._cancellable.cancel() + + for controller in self._controllers.values(): + controller.cancel() + + def _stop_hdlr(self): + logging.debug('ServiceABC._stop_hdlr()') + systemd.daemon.notify('STOPPING=1') + + self._cancel() # Cancel pending operations + + self._dump_last_known_config(self._controllers) + + if len(self._controllers) == 0: + GLib.idle_add(self._exit) + else: + # Tell all controller objects to disconnect + keep_connections = self._keep_connections_on_exit() + controllers = self._controllers.values() + logging.debug( + 'ServiceABC._stop_hdlr() - Controller count = %s, keep_connections = %s', + len(controllers), keep_connections + ) + for controller in controllers: + 
controller.disconnect(self._on_final_disconnect, keep_connections) + + return GLib.SOURCE_REMOVE + + def _on_final_disconnect(self, controller, success): + '''Callback invoked after a controller is disconnected. + THIS IS USED DURING PROCESS SHUTDOWN TO WAIT FOR ALL CONTROLLERS TO BE + DISCONNECTED BEFORE EXITING THE PROGRAM. ONLY CALL ON SHUTDOWN! + @param controller: the controller object + @param success: whether the disconnect operation was successful + ''' + logging.debug('ServiceABC._on_final_disconnect() - %s | %s disconnect %s', + controller.id, controller.device, 'succeeded' if success else 'failed') + self._remove_ctrl_from_dict(controller) + + controller.kill() + + # When all controllers have disconnected, we can finish the clean up + if len(self._controllers) == 0: + # Defer exit to the next main loop's idle period. + GLib.idle_add(self._exit) + + def _exit(self): + logging.debug('ServiceABC._exit()') + self._release_resources() + self._loop.quit() + + def _on_config_ctrls(self, *_user_data): + self._config_ctrls() + return GLib.SOURCE_REMOVE + + def _config_ctrls(self): + '''@brief Start controllers configuration.''' + # The configuration file may contain controllers and/or blacklist + # elements with traddr specified as hostname instead of IP address. + # Because of this, we need to remove those blacklisted elements before + # running name resolution. And we will need to remove blacklisted + # elements after name resolution is complete (i.e. in the calback + # function _config_ctrls_finish) + logging.debug('ServiceABC._config_ctrls()') + configured_controllers = remove_blacklisted(conf.SvcConf().get_controllers()) + self._resolver.resolve_ctrl_async(self._cancellable, configured_controllers, self._config_ctrls_finish) + + @abc.abstractmethod + def _keep_connections_on_exit(self): + '''@brief Determine whether connections should remain when the + process exits. + + NOTE) This is the base class method used to define the interface. 
+ It must be overloaded by a child class. + ''' + raise NotImplementedError() + + @abc.abstractmethod + def _config_ctrls_finish(self, configured_ctrl_list): + '''@brief Finish controllers configuration after hostnames (if any) + have been resolved. + + Configuring controllers must be done asynchronously in 2 steps. + In the first step, host names get resolved to find their IP addresses. + Name resolution can take a while, especially when an external name + resolution server is used. Once that step completed, the callback + method _config_ctrls_finish() (i.e. this method), gets invoked to + complete the controller configuration. + + NOTE) This is the base class method used to define the interface. + It must be overloaded by a child class. + ''' + raise NotImplementedError() + + @abc.abstractmethod + def _load_last_known_config(self): + raise NotImplementedError() + + @abc.abstractmethod + def _dump_last_known_config(self, controllers): + raise NotImplementedError() diff --git a/staslib/trid.py b/staslib/trid.py index def6ab2..38619e7 100644 --- a/staslib/trid.py +++ b/staslib/trid.py @@ -12,8 +12,7 @@ throughout nvme-stas to uniquely identify a Controller''' import hashlib from staslib import conf -class TID: - # pylint: disable=too-many-instance-attributes +class TID: # pylint: disable=too-many-instance-attributes '''Transport Identifier''' RDMA_IP_PORT = '4420' DISC_IP_PORT = '8009' diff --git a/staslib/udev.py b/staslib/udev.py index 29370b8..37b63cc 100644 --- a/staslib/udev.py +++ b/staslib/udev.py @@ -16,7 +16,7 @@ from staslib import defs, trid try: from pyudev.glib import MonitorObserver except (ModuleNotFoundError, AttributeError): - from staslib.glibudev import MonitorObserver # pylint: disable=relative-beyond-top-level,ungrouped-imports + from staslib.glibudev import MonitorObserver # pylint: disable=ungrouped-imports # ****************************************************************************** class Udev: @@ -99,7 +99,7 @@ class Udev: def 
get_attributes(self, sys_name: str, attr_ids) -> dict: '''@brief Get all the attributes associated with device @sys_name''' attrs = {attr_id: '' for attr_id in attr_ids} - if sys_name: + if sys_name and sys_name != 'nvme?': udev = self.get_nvme_device(sys_name) if udev is not None: for attr_id in attr_ids: diff --git a/test/test-config.py b/test/test-config.py index dad0ebd..db58883 100755 --- a/test/test-config.py +++ b/test/test-config.py @@ -40,7 +40,7 @@ class StasProcessConfUnitTest(unittest.TestCase): self.assertFalse(service_conf.data_digest) self.assertTrue(service_conf.persistent_connections) self.assertTrue(service_conf.udev_rule_enabled) - self.assertFalse(service_conf.sticky_connections) + self.assertTrue(service_conf.sticky_connections) self.assertFalse(service_conf.ignore_iface) self.assertIn(6, service_conf.ip_family) self.assertNotIn(4, service_conf.ip_family) diff --git a/test/test-controller.py b/test/test-controller.py index f23125e..f55781a 100755 --- a/test/test-controller.py +++ b/test/test-controller.py @@ -8,24 +8,43 @@ from pyfakefs.fake_filesystem_unittest import TestCase LOOP = GLib.MainLoop() + +class TestController(ctrl.Controller): + def _find_existing_connection(self): + pass + + def _on_aen(self, aen: int): + pass + + def _on_nvme_event(self, nvme_event): + pass + + class Test(TestCase): '''Unit tests for class Controller''' def setUp(self): self.setUpPyfakefs() - self.fs.create_file('/etc/nvme/hostnqn', contents='nqn.2014-08.org.nvmexpress:uuid:01234567-0123-0123-0123-0123456789ab\n') - self.fs.create_file('/etc/nvme/hostid', contents='01234567-89ab-cdef-0123-456789abcdef\n') - self.fs.create_file('/dev/nvme-fabrics', 
contents='instance=-1,cntlid=-1,transport=%s,traddr=%s,trsvcid=%s,nqn=%s,queue_size=%d,nr_io_queues=%d,reconnect_delay=%d,ctrl_loss_tmo=%d,keep_alive_tmo=%d,hostnqn=%s,host_traddr=%s,host_iface=%s,hostid=%s,duplicate_connect,disable_sqflow,hdr_digest,data_digest,nr_write_queues=%d,nr_poll_queues=%d,tos=%d,fast_io_fail_tmo=%d,discovery,dhchap_secret=%s,dhchap_ctrl_secret=%s\n') + self.fs.create_file( + '/etc/nvme/hostnqn', contents='nqn.2014-08.org.nvmexpress:uuid:01234567-0123-0123-0123-0123456789ab\n' + ) + self.fs.create_file('/etc/nvme/hostid', contents='01234567-89ab-cdef-0123-456789abcdef\n') + self.fs.create_file( + '/dev/nvme-fabrics', + contents='instance=-1,cntlid=-1,transport=%s,traddr=%s,trsvcid=%s,nqn=%s,queue_size=%d,nr_io_queues=%d,reconnect_delay=%d,ctrl_loss_tmo=%d,keep_alive_tmo=%d,hostnqn=%s,host_traddr=%s,host_iface=%s,hostid=%s,duplicate_connect,disable_sqflow,hdr_digest,data_digest,nr_write_queues=%d,nr_poll_queues=%d,tos=%d,fast_io_fail_tmo=%d,discovery,dhchap_secret=%s,dhchap_ctrl_secret=%s\n', + ) - self.NVME_TID = trid.TID({ - 'transport': 'tcp', - 'traddr': '10.10.10.10', - 'subsysnqn': 'nqn.1988-11.com.dell:SFSS:2:20220208134025e8', - 'trsvcid': '8009', - 'host-traddr': '1.2.3.4', - 'host-iface': 'wlp0s20f3', - }) + self.NVME_TID = trid.TID( + { + 'transport': 'tcp', + 'traddr': '10.10.10.10', + 'subsysnqn': 'nqn.1988-11.com.dell:SFSS:2:20220208134025e8', + 'trsvcid': '8009', + 'host-traddr': '1.2.3.4', + 'host-iface': 'wlp0s20f3', + } + ) sysconf = conf.SysConf() self.root = nvme.root() @@ -34,32 +53,92 @@ class Test(TestCase): def tearDown(self): LOOP.quit() + def test_cannot_instantiate_concrete_classes_if_abstract_method_are_not_implemented(self): + # Make sure we can't instantiate the ABC directly (Abstract Base Class). 
+ class Controller(ctrl.Controller): + pass + + self.assertRaises(TypeError, lambda: ctrl.Controller(root=self.root, host=self.host, tid=self.NVME_TID)) + def test_get_device(self): - controller = ctrl.Controller(root=self.root, host=self.host, tid=self.NVME_TID) + controller = TestController(root=self.root, host=self.host, tid=self.NVME_TID) self.assertEqual(controller._connect_attempts, 0) - self.assertRaises(NotImplementedError, controller._try_to_connect) + controller._try_to_connect() self.assertEqual(controller._connect_attempts, 1) - self.assertRaises(NotImplementedError, controller._find_existing_connection) - self.assertEqual(controller.id, "(tcp, 10.10.10.10, 8009, nqn.1988-11.com.dell:SFSS:2:20220208134025e8, wlp0s20f3, 1.2.3.4)") + self.assertEqual( + controller.id, "(tcp, 10.10.10.10, 8009, nqn.1988-11.com.dell:SFSS:2:20220208134025e8, wlp0s20f3, 1.2.3.4)" + ) # raise Exception(controller._connect_op) - self.assertEqual(str(controller.tid), "(tcp, 10.10.10.10, 8009, nqn.1988-11.com.dell:SFSS:2:20220208134025e8, wlp0s20f3, 1.2.3.4)") - self.assertEqual(controller.device, '') - self.assertEqual(str(controller.controller_id_dict()), "{'transport': 'tcp', 'traddr': '10.10.10.10', 'trsvcid': '8009', 'host-traddr': '1.2.3.4', 'host-iface': 'wlp0s20f3', 'subsysnqn': 'nqn.1988-11.com.dell:SFSS:2:20220208134025e8', 'device': ''}") - # self.assertEqual(controller.details(), "{'transport': 'tcp', 'traddr': '10.10.10.[265 chars]ff]'}") - self.assertEqual(controller.info(), {'transport': 'tcp', 'traddr': '10.10.10.10', 'trsvcid': '8009', 'host-traddr': '1.2.3.4', 'host-iface': 'wlp0s20f3', 'subsysnqn': 'nqn.1988-11.com.dell:SFSS:2:20220208134025e8', 'device': '', 'hostid': '', 'hostnqn': '', 'model': '', 'serial': '', 'connect attempts': '1', 'retry connect timer': '60.0s [off]'}) + self.assertEqual( + str(controller.tid), + "(tcp, 10.10.10.10, 8009, nqn.1988-11.com.dell:SFSS:2:20220208134025e8, wlp0s20f3, 1.2.3.4)", + ) + self.assertEqual(controller.device, 
'nvme?') + self.assertEqual( + str(controller.controller_id_dict()), + "{'transport': 'tcp', 'traddr': '10.10.10.10', 'trsvcid': '8009', 'host-traddr': '1.2.3.4', 'host-iface': 'wlp0s20f3', 'subsysnqn': 'nqn.1988-11.com.dell:SFSS:2:20220208134025e8', 'device': 'nvme?'}", + ) + self.assertEqual( + controller.details(), + { + 'dctype': '', + 'cntrltype': '', + 'transport': 'tcp', + 'traddr': '10.10.10.10', + 'trsvcid': '8009', + 'host-traddr': '1.2.3.4', + 'host-iface': 'wlp0s20f3', + 'subsysnqn': 'nqn.1988-11.com.dell:SFSS:2:20220208134025e8', + 'device': 'nvme?', + 'connect attempts': '1', + 'retry connect timer': '60.0s [off]', + 'hostid': '', + 'hostnqn': '', + 'model': '', + 'serial': '', + }, + ) + self.assertEqual( + controller.info(), + { + 'dctype': '', + 'cntrltype': '', + 'transport': 'tcp', + 'traddr': '10.10.10.10', + 'trsvcid': '8009', + 'host-traddr': '1.2.3.4', + 'host-iface': 'wlp0s20f3', + 'subsysnqn': 'nqn.1988-11.com.dell:SFSS:2:20220208134025e8', + 'device': 'nvme?', + 'connect attempts': '1', + 'retry connect timer': '60.0s [off]', + 'hostid': '', + 'hostnqn': '', + 'model': '', + 'serial': '', + 'connect operation': {'fail count': 0}, + }, + ) + # print(controller._connect_op) self.assertEqual(controller.cancel(), None) self.assertEqual(controller.kill(), None) # self.assertEqual(controller.disconnect(), 0) def test_connect(self): - controller = ctrl.Controller(root=self.root, host=self.host, tid=self.NVME_TID) + controller = TestController(root=self.root, host=self.host, tid=self.NVME_TID) self.assertEqual(controller._connect_attempts, 0) - controller._find_existing_connection = lambda : None + controller._find_existing_connection = lambda: None with self.assertLogs(logger=logging.getLogger(), level='DEBUG') as captured: controller._try_to_connect() self.assertEqual(len(captured.records), 1) - self.assertTrue(captured.records[0].getMessage().startswith("Controller._try_to_connect() - (tcp, 10.10.10.10, 8009, 
nqn.1988-11.com.dell:SFSS:2:20220208134025e8, wlp0s20f3, 1.2.3.4) Connecting to nvme control with cfg={'hdr_digest': False, 'data_digest': False")) + self.assertTrue( + captured.records[0] + .getMessage() + .startswith( + "Controller._try_to_connect() - (tcp, 10.10.10.10, 8009, nqn.1988-11.com.dell:SFSS:2:20220208134025e8, wlp0s20f3, 1.2.3.4) Connecting to nvme control with cfg={'hdr_digest': False, 'data_digest': False" + ) + ) self.assertEqual(controller._connect_attempts, 1) diff --git a/test/test-service.py b/test/test-service.py index 19f9b0c..4ce37be 100755 --- a/test/test-service.py +++ b/test/test-service.py @@ -4,6 +4,7 @@ import unittest from staslib import service from pyfakefs.fake_filesystem_unittest import TestCase + class Args: def __init__(self): self.tron = True @@ -11,6 +12,20 @@ class Args: self.conf_file = '/dev/null' +class TestService(service.Service): + def _config_ctrls_finish(self, configured_ctrl_list): + pass + + def _dump_last_known_config(self, controllers): + pass + + def _keep_connections_on_exit(self): + pass + + def _load_last_known_config(self): + return dict() + + class Test(TestCase): '''Unit tests for class Service''' @@ -18,22 +33,39 @@ class Test(TestCase): self.setUpPyfakefs() os.environ['RUNTIME_DIRECTORY'] = "/run" - self.fs.create_file('/etc/nvme/hostnqn', contents='nqn.2014-08.org.nvmexpress:uuid:01234567-0123-0123-0123-0123456789ab\n') - self.fs.create_file('/etc/nvme/hostid', contents='01234567-89ab-cdef-0123-456789abcdef\n') - self.fs.create_file('/dev/nvme-fabrics', contents='instance=-1,cntlid=-1,transport=%s,traddr=%s,trsvcid=%s,nqn=%s,queue_size=%d,nr_io_queues=%d,reconnect_delay=%d,ctrl_loss_tmo=%d,keep_alive_tmo=%d,hostnqn=%s,host_traddr=%s,host_iface=%s,hostid=%s,duplicate_connect,disable_sqflow,hdr_digest,data_digest,nr_write_queues=%d,nr_poll_queues=%d,tos=%d,fast_io_fail_tmo=%d,discovery,dhchap_secret=%s,dhchap_ctrl_secret=%s\n') + self.fs.create_file( + '/etc/nvme/hostnqn', 
contents='nqn.2014-08.org.nvmexpress:uuid:01234567-0123-0123-0123-0123456789ab\n' + ) + self.fs.create_file('/etc/nvme/hostid', contents='01234567-89ab-cdef-0123-456789abcdef\n') + self.fs.create_file( + '/dev/nvme-fabrics', + contents='instance=-1,cntlid=-1,transport=%s,traddr=%s,trsvcid=%s,nqn=%s,queue_size=%d,nr_io_queues=%d,reconnect_delay=%d,ctrl_loss_tmo=%d,keep_alive_tmo=%d,hostnqn=%s,host_traddr=%s,host_iface=%s,hostid=%s,duplicate_connect,disable_sqflow,hdr_digest,data_digest,nr_write_queues=%d,nr_poll_queues=%d,tos=%d,fast_io_fail_tmo=%d,discovery,dhchap_secret=%s,dhchap_ctrl_secret=%s\n', + ) + + def test_cannot_instantiate_concrete_classes_if_abstract_method_are_not_implemented(self): + # Make sure we can't instantiate the ABC directly (Abstract Base Class). + class Service(service.Service): + pass + + self.assertRaises(TypeError, lambda: Service(Args(), reload_hdlr=lambda x: x)) def test_get_controller(self): - # FIXME: this is hack, fix it later - service.Service._load_last_known_config = lambda x : dict() - # start the test - - srv = service.Service(Args(), reload_hdlr=lambda x : x) - self.assertRaises(NotImplementedError, srv._keep_connections_on_exit) - self.assertRaises(NotImplementedError, srv._dump_last_known_config, []) - self.assertRaises(NotImplementedError, srv._on_config_ctrls) - #self.assertEqual(srv.get_controllers(), dict()) - self.assertEqual(srv.get_controller(transport='tcp', traddr='10.10.10.10', trsvcid='8009', host_traddr='1.2.3.4', host_iface='wlp0s20f3', subsysnqn='nqn.1988-11.com.dell:SFSS:2:20220208134025e8'), None) - self.assertEqual(srv.remove_controller(controller=None), None) + srv = TestService(Args(), reload_hdlr=lambda x: x) + + self.assertEqual(list(srv.get_controllers()), list()) + self.assertEqual( + srv.get_controller( + transport='tcp', + traddr='10.10.10.10', + trsvcid='8009', + host_traddr='1.2.3.4', + host_iface='wlp0s20f3', + subsysnqn='nqn.1988-11.com.dell:SFSS:2:20220208134025e8', + ), + None, + ) + 
self.assertEqual(srv.remove_controller(controller=None, success=True), None) + if __name__ == '__main__': unittest.main()