glusterfs/0085-Revert-all-remove-code-which-is-not-being-considered.patch

8977 lines
284 KiB
Diff
Raw Permalink Normal View History

From 379b9f7247a4daac9545e3dec79d3c2660111d8d Mon Sep 17 00:00:00 2001
From: Hari Gowtham <hgowtham@redhat.com>
Date: Mon, 8 Apr 2019 11:32:09 +0530
Subject: [PATCH 085/124] Revert "all: remove code which is not being
considered in build"
This reverts most of commit 8293d21280fd6ddfc9bb54068cf87794fc6be207.
It adds back the changes for tier and CTR, with the necessary changes for building them.
Label: DOWNSTREAM ONLY
Change-Id: I8f7978618f2a6a949b09dbcfd25722494cb8f1cd
Signed-off-by: Hari Gowtham <hgowtham@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/166245
Reviewed-by: Nithya Balachandran <nbalacha@redhat.com>
Tested-by: RHGS Build Bot <nigelb@redhat.com>
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
---
Makefile.am | 8 +-
configure.ac | 34 +
glusterfs.spec.in | 19 +
libglusterfs/Makefile.am | 4 +-
libglusterfs/src/glusterfs/mem-types.h | 1 +
xlators/cluster/dht/src/Makefile.am | 14 +-
xlators/cluster/dht/src/dht-rebalance.c | 12 +
xlators/cluster/dht/src/tier-common.c | 1199 ++++++++
xlators/cluster/dht/src/tier-common.h | 55 +
xlators/cluster/dht/src/tier.c | 3105 ++++++++++++++++++++
xlators/cluster/dht/src/tier.h | 110 +
xlators/features/Makefile.am | 2 +-
xlators/features/changetimerecorder/Makefile.am | 3 +
.../features/changetimerecorder/src/Makefile.am | 26 +
.../changetimerecorder/src/changetimerecorder.c | 2371 +++++++++++++++
.../changetimerecorder/src/changetimerecorder.h | 21 +
.../features/changetimerecorder/src/ctr-helper.c | 293 ++
.../features/changetimerecorder/src/ctr-helper.h | 854 ++++++
.../features/changetimerecorder/src/ctr-messages.h | 61 +
.../changetimerecorder/src/ctr-xlator-ctx.c | 362 +++
.../changetimerecorder/src/ctr-xlator-ctx.h | 68 +
.../changetimerecorder/src/ctr_mem_types.h | 22 +
22 files changed, 8637 insertions(+), 7 deletions(-)
create mode 100644 xlators/cluster/dht/src/tier-common.c
create mode 100644 xlators/cluster/dht/src/tier-common.h
create mode 100644 xlators/cluster/dht/src/tier.c
create mode 100644 xlators/cluster/dht/src/tier.h
create mode 100644 xlators/features/changetimerecorder/Makefile.am
create mode 100644 xlators/features/changetimerecorder/src/Makefile.am
create mode 100644 xlators/features/changetimerecorder/src/changetimerecorder.c
create mode 100644 xlators/features/changetimerecorder/src/changetimerecorder.h
create mode 100644 xlators/features/changetimerecorder/src/ctr-helper.c
create mode 100644 xlators/features/changetimerecorder/src/ctr-helper.h
create mode 100644 xlators/features/changetimerecorder/src/ctr-messages.h
create mode 100644 xlators/features/changetimerecorder/src/ctr-xlator-ctx.c
create mode 100644 xlators/features/changetimerecorder/src/ctr-xlator-ctx.h
create mode 100644 xlators/features/changetimerecorder/src/ctr_mem_types.h
diff --git a/Makefile.am b/Makefile.am
index e0c795f..613382f 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -3,7 +3,7 @@ SOURCES = site.h
EXTRA_DIST = autogen.sh \
COPYING-GPLV2 COPYING-LGPLV3 COMMITMENT \
INSTALL README.md AUTHORS THANKS NEWS \
- glusterfs.spec glusterfs-api.pc.in libgfchangelog.pc.in \
+ glusterfs.spec glusterfs-api.pc.in libgfchangelog.pc.in libgfdb.pc.in \
run-tests.sh \
build-aux/pkg-version \
contrib/umountd \
@@ -15,8 +15,12 @@ SUBDIRS = $(ARGP_STANDALONE_DIR) rpc/xdr/gen libglusterfs rpc api xlators \
pkgconfigdir = @pkgconfigdir@
pkgconfig_DATA = glusterfs-api.pc libgfchangelog.pc
+if USE_GFDB
+pkgconfig_DATA += libgfdb.pc
+endif
-CLEANFILES = glusterfs-api.pc libgfchangelog.pc contrib/umountd/Makefile
+CLEANFILES = glusterfs-api.pc libgfchangelog.pc libgfdb.pc \
+ contrib/umountd/Makefile
gitclean: distclean
find . -name Makefile.in -exec rm -f {} \;
diff --git a/configure.ac b/configure.ac
index baa811a..633e850 100644
--- a/configure.ac
+++ b/configure.ac
@@ -30,6 +30,7 @@ AC_CONFIG_HEADERS([config.h site.h])
AC_CONFIG_FILES([Makefile
libglusterfs/Makefile
libglusterfs/src/Makefile
+ libglusterfs/src/gfdb/Makefile
geo-replication/src/peer_gsec_create
geo-replication/src/peer_mountbroker
geo-replication/src/peer_mountbroker.py
@@ -121,6 +122,8 @@ AC_CONFIG_FILES([Makefile
xlators/features/changelog/src/Makefile
xlators/features/changelog/lib/Makefile
xlators/features/changelog/lib/src/Makefile
+ xlators/features/changetimerecorder/Makefile
+ xlators/features/changetimerecorder/src/Makefile
xlators/features/locks/Makefile
xlators/features/locks/src/Makefile
xlators/features/quota/Makefile
@@ -237,6 +240,7 @@ AC_CONFIG_FILES([Makefile
contrib/umountd/Makefile
glusterfs-api.pc
libgfchangelog.pc
+ libgfdb.pc
api/Makefile
api/src/Makefile
api/examples/Makefile
@@ -866,6 +870,33 @@ AM_CONDITIONAL([USE_FIREWALLD],test ["x${BUILD_FIREWALLD}" = "xyes"])
#endof firewald section
+# Data tiering requires sqlite
+AC_ARG_ENABLE([tiering],
+ AC_HELP_STRING([--disable-tiering],
+ [Disable data classification/tiering]),
+ [BUILD_GFDB="${enableval}"], [BUILD_GFDB="yes"])
+
+case $host_os in
+ darwin*)
+ SQLITE_LIBS="-lsqlite3"
+ AC_CHECK_HEADERS([sqlite3.h], AC_DEFINE(USE_GFDB, 1))
+ ;;
+ *)
+ if test "x${BUILD_GFDB}" = "xyes"; then
+ PKG_CHECK_MODULES([SQLITE], [sqlite3],
+ AC_DEFINE(USE_GFDB, 1),
+ AC_MSG_ERROR([pass --disable-tiering to build without sqlite]))
+ else
+ AC_DEFINE(USE_GFDB, 0, [no sqlite, gfdb is disabled])
+ fi
+ ;;
+esac
+
+AC_SUBST(SQLITE_CFLAGS)
+AC_SUBST(SQLITE_LIBS)
+AM_CONDITIONAL(BUILD_GFDB, test "x${with_server}" = "xyes" -a "x${BUILD_GFDB}" = "xyes")
+AM_CONDITIONAL(USE_GFDB, test "x${with_server}" = "xyes" -a "x${BUILD_GFDB}" = "xyes")
+
# xml-output
AC_ARG_ENABLE([xml-output],
AC_HELP_STRING([--disable-xml-output],
@@ -1544,6 +1575,8 @@ GFAPI_VERSION="7."${PACKAGE_VERSION}
LIBGFCHANGELOG_VERSION="0.0.1"
AC_SUBST(GFAPI_VERSION)
AC_SUBST(LIBGFCHANGELOG_VERSION)
+LIBGFDB_VERSION="0.0.1"
+AC_SUBST(LIBGFDB_VERSION)
dnl libtool versioning
LIBGFXDR_LT_VERSION="0:1:0"
@@ -1584,6 +1617,7 @@ echo "XML output : $BUILD_XML_OUTPUT"
echo "Unit Tests : $BUILD_UNITTEST"
echo "Track priv ports : $TRACK_PRIVPORTS"
echo "POSIX ACLs : $BUILD_POSIX_ACLS"
+echo "Data Classification : $BUILD_GFDB"
echo "firewalld-config : $BUILD_FIREWALLD"
echo "Events : $BUILD_EVENTS"
echo "EC dynamic support : $EC_DYNAMIC_SUPPORT"
diff --git a/glusterfs.spec.in b/glusterfs.spec.in
index 2149f86..e0607ba 100644
--- a/glusterfs.spec.in
+++ b/glusterfs.spec.in
@@ -154,6 +154,7 @@
%global _without_events --disable-events
%global _without_georeplication --disable-georeplication
%global _with_gnfs %{nil}
+%global _without_tiering --disable-tiering
%global _without_ocf --without-ocf
%endif
@@ -287,6 +288,9 @@ BuildRequires: libuuid-devel
%if ( 0%{?_with_cmocka:1} )
BuildRequires: libcmocka-devel >= 1.0.1
%endif
+%if ( 0%{!?_without_tiering:1} )
+BuildRequires: sqlite-devel
+%endif
%if ( 0%{!?_without_georeplication:1} )
BuildRequires: libattr-devel
%endif
@@ -797,6 +801,7 @@ export LDFLAGS
%{?_without_rdma} \
%{?_without_server} \
%{?_without_syslog} \
+ %{?_without_tiering} \
%{?_with_ipv6default} \
%{?_without_libtirpc}
@@ -1232,9 +1237,15 @@ exit 0
%if ( 0%{?_without_server:1} )
%exclude %{_libdir}/pkgconfig/libgfchangelog.pc
%exclude %{_libdir}/libgfchangelog.so
+%if ( 0%{!?_without_tiering:1} )
+%{_libdir}/pkgconfig/libgfdb.pc
+%endif
%else
%{_libdir}/pkgconfig/libgfchangelog.pc
%{_libdir}/libgfchangelog.so
+%if ( 0%{!?_without_tiering:1} )
+%{_libdir}/pkgconfig/libgfdb.pc
+%endif
%endif
%files client-xlators
@@ -1330,6 +1341,10 @@ exit 0
%files libs
%{_libdir}/*.so.*
%exclude %{_libdir}/libgfapi.*
+%if ( 0%{!?_without_tiering:1} )
+# libgfdb is only needed server-side
+%exclude %{_libdir}/libgfdb.*
+%endif
%files -n python%{_pythonver}-gluster
# introducing glusterfs module in site packages.
@@ -1417,6 +1432,10 @@ exit 0
%{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/features/bit-rot.so
%{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/features/bitrot-stub.so
%{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/features/sdfs.so
+%if ( 0%{!?_without_tiering:1} )
+ %{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/features/changetimerecorder.so
+ %{_libdir}/libgfdb.so.*
+%endif
%{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/features/index.so
%{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/features/locks.so
%{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/features/posix*
diff --git a/libglusterfs/Makefile.am b/libglusterfs/Makefile.am
index d471a3f..7e72f61 100644
--- a/libglusterfs/Makefile.am
+++ b/libglusterfs/Makefile.am
@@ -1,3 +1,3 @@
-SUBDIRS = src
+SUBDIRS = src src/gfdb
-CLEANFILES =
+CLEANFILES =
diff --git a/libglusterfs/src/glusterfs/mem-types.h b/libglusterfs/src/glusterfs/mem-types.h
index 832f68c..92730a9 100644
--- a/libglusterfs/src/glusterfs/mem-types.h
+++ b/libglusterfs/src/glusterfs/mem-types.h
@@ -138,6 +138,7 @@ enum gf_common_mem_types_ {
gf_common_volfile_t,
gf_common_mt_mgmt_v3_lock_timer_t, /* used only in one location */
gf_common_mt_server_cmdline_t, /* used only in one location */
+ gf_mt_gfdb_query_record_t,
gf_common_mt_end
};
#endif
diff --git a/xlators/cluster/dht/src/Makefile.am b/xlators/cluster/dht/src/Makefile.am
index 56f1f2a..5532047 100644
--- a/xlators/cluster/dht/src/Makefile.am
+++ b/xlators/cluster/dht/src/Makefile.am
@@ -1,4 +1,7 @@
xlator_LTLIBRARIES = dht.la nufa.la switch.la
+if BUILD_GFDB
+ xlator_LTLIBRARIES += tier.la
+endif
AM_CFLAGS = -Wall $(GF_CFLAGS)
@@ -13,6 +16,7 @@ dht_la_SOURCES = $(dht_common_source) dht.c
nufa_la_SOURCES = $(dht_common_source) nufa.c
switch_la_SOURCES = $(dht_common_source) switch.c
+tier_la_SOURCES = $(dht_common_source) tier.c tier-common.c
dht_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
dht_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
@@ -23,15 +27,21 @@ nufa_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
switch_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
switch_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+tier_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) $(LIB_DL)
+tier_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
noinst_HEADERS = dht-common.h dht-mem-types.h dht-messages.h \
- dht-lock.h $(top_builddir)/xlators/lib/src/libxlator.h
+ dht-lock.h tier-common.h tier.h \
+ $(top_builddir)/xlators/lib/src/libxlator.h
AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+ -I$(top_srcdir)/libglusterfs/src/gfdb \
-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \
-I$(top_srcdir)/rpc/rpc-lib/src \
-I$(top_srcdir)/xlators/lib/src \
-DDATADIR=\"$(localstatedir)\" \
- -DLIBDIR=\"$(libdir)\"
+ -DLIBDIR=\"$(libdir)\" \
+ -DLIBGFDB_VERSION=\"$(LIBGFDB_VERSION)\"
CLEANFILES =
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
index e0f25b1..efbe8a4 100644
--- a/xlators/cluster/dht/src/dht-rebalance.c
+++ b/xlators/cluster/dht/src/dht-rebalance.c
@@ -8,6 +8,7 @@
cases as published by the Free Software Foundation.
*/
+#include "tier.h"
#include "dht-common.h"
#include <glusterfs/xlator.h>
#include <glusterfs/syscall.h>
@@ -2134,6 +2135,17 @@ dht_migrate_file(xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
}
}
+ /* store size of previous migrated file */
+ if (defrag && defrag->tier_conf.is_tier) {
+ if (from != TIER_HASHED_SUBVOL) {
+ defrag->tier_conf.st_last_promoted_size = stbuf.ia_size;
+ } else {
+ /* Don't delete the linkto file on the hashed subvol */
+ delete_src_linkto = _gf_false;
+ defrag->tier_conf.st_last_demoted_size = stbuf.ia_size;
+ }
+ }
+
/* The src file is being unlinked after this so we don't need
to clean it up */
clean_src = _gf_false;
diff --git a/xlators/cluster/dht/src/tier-common.c b/xlators/cluster/dht/src/tier-common.c
new file mode 100644
index 0000000..b22f477
--- /dev/null
+++ b/xlators/cluster/dht/src/tier-common.c
@@ -0,0 +1,1199 @@
+/*
+ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/xlator.h>
+#include "libxlator.h"
+#include "dht-common.h"
+#include <glusterfs/defaults.h>
+#include "tier-common.h"
+#include "tier.h"
+
+int
+dht_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, inode_t *inode, struct iatt *stbuf,
+ struct iatt *preparent, struct iatt *postparent, dict_t *xdata);
+
+/*
+ * Callback for the link wound by tier_link() to the hashed subvolume, and
+ * again for the follow-up link this function winds to local->cached_subvol.
+ * local->call_cnt tells the two apart: 1 means the first link has just
+ * returned, 2 means the follow-up has.
+ */
+int
+tier_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+              int op_errno, inode_t *inode, struct iatt *stbuf,
+              struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+{
+    dht_local_t *local = frame->local;
+
+    /*
+     * A failure ends the fop here. There is no continuation on DHT inode
+     * missing errors: we should then have a good stbuf that states P2
+     * happened. The inode goes missing if the file completed migration
+     * between the lookup and the link call.
+     */
+    if ((op_ret != -1) && (local->call_cnt == 1)) {
+        local->call_cnt = 2;
+
+        /* First link succeeded; now do the same on the hot tier. */
+        STACK_WIND(frame, tier_link_cbk, local->cached_subvol,
+                   local->cached_subvol->fops->link, &local->loc,
+                   &local->loc2, xdata);
+        return 0;
+    }
+
+    DHT_STRIP_PHASE1_FLAGS(stbuf);
+
+    DHT_STACK_UNWIND(link, frame, op_ret, op_errno, inode, stbuf, preparent,
+                     postparent, NULL);
+
+    return 0;
+}
+
+/*
+ * link fop for the tier translator.
+ *
+ * When the file's cached subvolume is TIER_HASHED_SUBVOL, the link is wound
+ * there once with dht_link_cbk. Otherwise it is wound to TIER_HASHED_SUBVOL
+ * first and tier_link_cbk repeats it on the cached subvolume.
+ *
+ * Always returns 0; failures are reported by unwinding with op_ret -1.
+ */
+int
+tier_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+          dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    dht_conf_t *conf = NULL;
+    xlator_t *cached_subvol = NULL;
+    xlator_t *hashed_subvol = NULL;
+    int op_errno = -1;
+
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(oldloc, err);
+    VALIDATE_OR_GOTO(newloc, err);
+
+    conf = this->private;
+
+    local = dht_local_init(frame, oldloc, NULL, GF_FOP_LINK);
+    if (local == NULL) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+    /* tier_link_cbk uses call_cnt to recognise the first of two links. */
+    local->call_cnt = 1;
+
+    cached_subvol = local->cached_subvol;
+    if (cached_subvol == NULL) {
+        gf_msg_debug(this->name, 0, "no cached subvolume for path=%s",
+                     oldloc->path);
+        op_errno = ENOENT;
+        goto err;
+    }
+
+    hashed_subvol = TIER_HASHED_SUBVOL;
+
+    if (loc_copy(&local->loc2, newloc) == -1) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    if (hashed_subvol != cached_subvol) {
+        /*
+         * Create hardlinks to both the data file on the hot tier
+         * and the linkto file on the cold tier, cold tier first.
+         */
+        gf_uuid_copy(local->gfid, oldloc->inode->gfid);
+
+        STACK_WIND(frame, tier_link_cbk, hashed_subvol,
+                   hashed_subvol->fops->link, oldloc, newloc, xdata);
+    } else {
+        STACK_WIND(frame, dht_link_cbk, cached_subvol,
+                   cached_subvol->fops->link, oldloc, newloc, xdata);
+    }
+
+    return 0;
+
+err:
+    /* A failed VALIDATE_OR_GOTO leaves op_errno at -1; fall back to errno. */
+    if (op_errno == -1)
+        op_errno = errno;
+    DHT_STACK_UNWIND(link, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
+    return 0;
+}
+
+/*
+ * Callback for the unlink that tier_create_cbk winds to remove the linkfile
+ * after the data file creation failed. The result of the unlink itself is
+ * ignored: the create is always unwound as a failure carrying the errno
+ * saved in local->op_errno.
+ */
+int
+tier_create_unlink_stale_linkto_cbk(call_frame_t *frame, void *cookie,
+                                    xlator_t *this, int op_ret, int op_errno,
+                                    struct iatt *preparent,
+                                    struct iatt *postparent, dict_t *xdata)
+{
+    dht_local_t *local = frame->local;
+
+    if (local->params != NULL)
+        dict_del(local->params, GLUSTERFS_INTERNAL_FOP_KEY);
+
+    DHT_STACK_UNWIND(create, frame, -1, local->op_errno, NULL, NULL, NULL, NULL,
+                     NULL, NULL);
+
+    return 0;
+}
+
+/*
+ * Callback for the data file create wound by tier_create() or
+ * tier_create_linkfile_create_cbk(); cookie is the subvolume the create was
+ * wound to.
+ *
+ * On failure, if a linkfile was created first (local->linked), the linkfile
+ * is unlinked from TIER_HASHED_SUBVOL before the create is unwound. On
+ * success the inode gets a preset layout for the subvolume holding the file.
+ */
+int
+tier_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                int op_errno, fd_t *fd, inode_t *inode, struct iatt *stbuf,
+                struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+{
+    dht_local_t *local = frame->local;
+    dht_conf_t *conf = this->private;
+    xlator_t *hashed_subvol = TIER_HASHED_SUBVOL;
+    xlator_t *prev = NULL;
+    int rc = -1;
+
+    if (local == NULL) {
+        op_ret = -1;
+        op_errno = EINVAL;
+        goto out;
+    }
+
+    if (op_ret == -1) {
+        /* Without a linkfile of our own there is nothing to clean up. */
+        if ((local->linked != _gf_true) || !local->xattr_req)
+            goto out;
+
+        /* Saved for tier_create_unlink_stale_linkto_cbk to unwind with. */
+        local->op_errno = op_errno;
+        local->op_ret = op_ret;
+
+        rc = dht_fill_dict_to_avoid_unlink_of_migrating_file(local->xattr_req);
+        if (rc) {
+            gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
+                   "Failed to set dictionary value to "
+                   "unlink of migrating file");
+            goto out;
+        }
+
+        STACK_WIND(frame, tier_create_unlink_stale_linkto_cbk, hashed_subvol,
+                   hashed_subvol->fops->unlink, &local->loc, 0,
+                   local->xattr_req);
+        return 0;
+    }
+
+    prev = cookie;
+
+    if (local->loc.parent) {
+        dht_inode_ctx_time_update(local->loc.parent, this, preparent, 0);
+
+        dht_inode_ctx_time_update(local->loc.parent, this, postparent, 1);
+    }
+
+    rc = dht_layout_preset(this, prev, inode);
+    if (rc != 0) {
+        gf_msg_debug(this->name, 0, "could not set preset layout for subvol %s",
+                     prev->name);
+        op_ret = -1;
+        op_errno = EINVAL;
+        goto out;
+    }
+
+    local->op_errno = op_errno;
+
+    if (local->linked == _gf_true) {
+        local->stbuf = *stbuf;
+        dht_linkfile_attr_heal(frame, this);
+    }
+
+out:
+    /* The stored linkfile gfid is only needed for the stale-linkfile unlink. */
+    if (local && local->xattr_req)
+        dict_del(local->xattr_req, TIER_LINKFILE_GFID);
+
+    DHT_STRIP_PHASE1_FLAGS(stbuf);
+
+    DHT_STACK_UNWIND(create, frame, op_ret, op_errno, fd, inode, stbuf,
+                     preparent, postparent, xdata);
+
+    return 0;
+}
+
+/*
+ * Callback for the linkfile creation started by tier_create(). Once the
+ * linkfile exists, its gfid is remembered in local->xattr_req and the data
+ * file create is wound to TIER_UNHASHED_SUBVOL with tier_create_cbk.
+ */
+int
+tier_create_linkfile_create_cbk(call_frame_t *frame, void *cookie,
+                                xlator_t *this, int32_t op_ret,
+                                int32_t op_errno, inode_t *inode,
+                                struct iatt *stbuf, struct iatt *preparent,
+                                struct iatt *postparent, dict_t *xdata)
+{
+    dht_local_t *local = frame->local;
+    dht_conf_t *conf = NULL;
+    xlator_t *data_subvol = NULL;
+    unsigned char *linkfile_gfid = NULL;
+
+    if (local == NULL) {
+        op_errno = EINVAL;
+        goto err;
+    }
+
+    if (op_ret == -1) {
+        local->op_errno = op_errno;
+        goto err;
+    }
+
+    conf = this->private;
+    if (conf == NULL) {
+        op_errno = EINVAL;
+        local->op_errno = op_errno;
+        goto err;
+    }
+
+    data_subvol = TIER_UNHASHED_SUBVOL;
+
+    /* These keys were only meant for the linkfile creation. */
+    if (local->params) {
+        dict_del(local->params, conf->link_xattr_name);
+        dict_del(local->params, GLUSTERFS_INTERNAL_FOP_KEY);
+    }
+
+    /*
+     * The linkfile is deleted again if the data file creation fails, and
+     * that deletion can race with a stale-linkfile deletion triggered by a
+     * lookup from a different client. For example:
+     *
+     *   1. client 1 creates the linkfile for foo
+     *   2. client 1 fails to create the data file
+     *   3. client 2 starts creating a file with the same name
+     *   4. client 2's lookup before creation deletes the linkfile created
+     *      by client 1, considering it stale
+     *   5. client 2 creates a new linkfile for foo with a different gfid
+     *   6. client 1 triggers linkfile deletion as its data file creation
+     *      failed
+     *   7. the linkfile created by client 2 gets deleted
+     *   8. client 2 creates the data file
+     *
+     * The result is a file in a non-hashed subvol without a linkfile in the
+     * hashed subvol. To avoid this, the gfid of the linkfile created by this
+     * client is stored, so that when the linkfile is deleted the gfid of the
+     * existing file can be validated against the stored value in the posix
+     * layer.
+     *
+     * The value lives in local->xattr_req because local->params is also
+     * used to create the data file; the linkfile deletion uses the
+     * local->xattr_req dictionary.
+     */
+    if (local->xattr_req == NULL) {
+        local->xattr_req = dict_new();
+        if (local->xattr_req == NULL) {
+            op_errno = ENOMEM;
+            local->op_errno = op_errno;
+            goto err;
+        }
+    }
+
+    linkfile_gfid = GF_MALLOC(sizeof(uuid_t), gf_common_mt_char);
+    if (linkfile_gfid == NULL) {
+        op_errno = ENOMEM;
+        local->op_errno = op_errno;
+        goto err;
+    }
+
+    gf_uuid_copy(linkfile_gfid, stbuf->ia_gfid);
+
+    /* Failing to store the gfid is only logged; the create still goes on. */
+    if (dict_set_dynptr(local->xattr_req, TIER_LINKFILE_GFID, linkfile_gfid,
+                        sizeof(uuid_t))) {
+        GF_FREE(linkfile_gfid);
+        gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
+               "Failed to set dictionary value"
+               " : key = %s",
+               TIER_LINKFILE_GFID);
+    }
+
+    STACK_WIND_COOKIE(frame, tier_create_cbk, data_subvol, data_subvol,
+                      data_subvol->fops->create, &local->loc, local->flags,
+                      local->mode, local->umask, local->fd, local->params);
+
+    return 0;
+
+err:
+    DHT_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL,
+                     NULL);
+    return 0;
+}
+
+/*
+ * Returns _gf_true when conf->subvolumes[1] (the hot tier) is listed in
+ * conf->decommissioned_bricks, _gf_false otherwise.
+ */
+gf_boolean_t
+tier_is_hot_tier_decommissioned(xlator_t *this)
+{
+    dht_conf_t *conf = this->private;
+    xlator_t *hot_tier = conf->subvolumes[1];
+    int idx = 0;
+
+    /* Nothing is being decommissioned, so there is nothing to search. */
+    if (!conf->decommission_subvols_cnt)
+        return _gf_false;
+
+    for (idx = 0; idx < conf->subvolume_cnt; idx++) {
+        if (conf->decommissioned_bricks[idx] &&
+            conf->decommissioned_bricks[idx] == hot_tier)
+            return _gf_true;
+    }
+
+    return _gf_false;
+}
+
+/*
+ * create fop for the tier translator.
+ *
+ * New files go to the hot tier, with a linkfile on the cold tier, unless the
+ * hot tier is filled or decommissioned; in that case the file is created
+ * directly on the cold tier.
+ *
+ * Always returns 0; failures are reported by unwinding with op_ret -1.
+ */
+int
+tier_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+            mode_t mode, mode_t umask, fd_t *fd, dict_t *params)
+{
+    dht_local_t *local = NULL;
+    dht_conf_t *conf = NULL;
+    xlator_t *hot_subvol = NULL;
+    xlator_t *cold_subvol = NULL;
+    int op_errno = -1;
+
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(loc, err);
+
+    conf = this->private;
+
+    dht_get_du_info(frame, this, loc);
+
+    local = dht_local_init(frame, loc, fd, GF_FOP_CREATE);
+    if (local == NULL) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    cold_subvol = TIER_HASHED_SUBVOL;
+    hot_subvol = TIER_UNHASHED_SUBVOL;
+
+    if (conf->subvolumes[0] != cold_subvol) {
+        hot_subvol = conf->subvolumes[0];
+    }
+
+    /*
+     * If the hot tier is full, or is being decommissioned, create the file
+     * on the cold tier. No linkfile is needed in that case.
+     */
+    if (dht_is_subvol_filled(this, hot_subvol) ||
+        tier_is_hot_tier_decommissioned(this)) {
+        gf_msg_debug(this->name, 0, "creating %s on %s", loc->path,
+                     cold_subvol->name);
+
+        STACK_WIND_COOKIE(frame, tier_create_cbk, cold_subvol, cold_subvol,
+                          cold_subvol->fops->create, loc, flags, mode, umask,
+                          fd, params);
+        return 0;
+    }
+
+    /*
+     * Hot tier create: stash the arguments in local, because the data file
+     * is only created from tier_create_linkfile_create_cbk once the
+     * linkfile exists.
+     */
+    local->params = dict_ref(params);
+    local->flags = flags;
+    local->mode = mode;
+    local->umask = umask;
+    local->cached_subvol = hot_subvol;
+    local->hashed_subvol = cold_subvol;
+
+    gf_msg_debug(this->name, 0, "creating %s on %s (link at %s)", loc->path,
+                 hot_subvol->name, cold_subvol->name);
+
+    dht_linkfile_create(frame, tier_create_linkfile_create_cbk, this,
+                        hot_subvol, cold_subvol, loc);
+
+    return 0;
+
+err:
+    /* A failed VALIDATE_OR_GOTO leaves op_errno at -1; fall back to errno. */
+    if (op_errno == -1)
+        op_errno = errno;
+    DHT_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL,
+                     NULL);
+
+    return 0;
+}
+
+/*
+ * Callback for the unlink that tier_unlink_lookup_cbk winds to the hot tier.
+ * ENOENT counts as success; any other error fails the unlink fop.
+ */
+int
+tier_unlink_nonhashed_linkfile_cbk(call_frame_t *frame, void *cookie,
+                                   xlator_t *this, int op_ret, int op_errno,
+                                   struct iatt *preparent,
+                                   struct iatt *postparent, dict_t *xdata)
+{
+    dht_local_t *local = frame->local;
+    xlator_t *prev = cookie;
+
+    LOCK(&frame->lock);
+    {
+        if ((op_ret == -1) && (op_errno != ENOENT)) {
+            local->op_errno = op_errno;
+            local->op_ret = op_ret;
+            gf_msg_debug(this->name, op_errno,
+                         "Unlink link: subvolume %s"
+                         " returned -1",
+                         prev->name);
+        } else {
+            local->op_ret = 0;
+        }
+    }
+    UNLOCK(&frame->lock);
+
+    if (local->op_ret == -1) {
+        DHT_STACK_UNWIND(unlink, frame, -1, local->op_errno, NULL, NULL, NULL);
+        return 0;
+    }
+
+    /* The parent iatts are the ones tier_unlink_cbk saved in local. */
+    DHT_STACK_UNWIND(unlink, frame, local->op_ret, local->op_errno,
+                     &local->preparent, &local->postparent, NULL);
+
+    return 0;
+}
+
+/*
+ * Callback for the lookup that tier_unlink_cbk winds to the hot tier when
+ * the unlinked file was migrating. If the lookup finds a file, it is
+ * unlinked from the hot tier; otherwise the unlink fop is unwound, with
+ * ENOENT from the lookup treated as success.
+ */
+int
+tier_unlink_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int op_ret, int op_errno, inode_t *inode,
+                       struct iatt *preparent, dict_t *xdata,
+                       struct iatt *postparent)
+{
+    dht_local_t *local = frame->local;
+    dht_conf_t *conf = this->private;
+    xlator_t *prev = cookie;
+    xlator_t *hot_subvol = TIER_UNHASHED_SUBVOL;
+
+    if (op_ret == 0) {
+        /*
+         * linkfile present on hot tier. unlinking the linkfile
+         */
+        STACK_WIND_COOKIE(frame, tier_unlink_nonhashed_linkfile_cbk, hot_subvol,
+                          hot_subvol, hot_subvol->fops->unlink, &local->loc,
+                          local->flags, NULL);
+        return 0;
+    }
+
+    LOCK(&frame->lock);
+    {
+        /* A missing file on the hot tier is not an error for unlink. */
+        local->op_ret = (op_errno == ENOENT) ? 0 : op_ret;
+        local->op_errno = op_errno;
+
+        gf_msg_debug(this->name, op_errno, "Lookup : subvolume %s returned -1",
+                     prev->name);
+    }
+    UNLOCK(&frame->lock);
+
+    DHT_STACK_UNWIND(unlink, frame, local->op_ret, local->op_errno,
+                     &local->preparent, &local->postparent, xdata);
+
+    return 0;
+}
+
+/*
+ * Callback for the unlink that tier_unlink_cbk winds to the cold tier after
+ * the data file was removed from the hot tier. ENOENT and EINVAL count as
+ * success; any other error fails the unlink fop.
+ */
+int
+tier_unlink_linkfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                         int op_ret, int op_errno, struct iatt *preparent,
+                         struct iatt *postparent, dict_t *xdata)
+{
+    dht_local_t *local = frame->local;
+    xlator_t *prev = cookie;
+
+    LOCK(&frame->lock);
+    {
+        /* Ignore EINVAL for tier to ignore error when the file
+           does not exist on the other tier */
+        if ((op_ret == -1) && (op_errno != ENOENT) && (op_errno != EINVAL)) {
+            local->op_errno = op_errno;
+            local->op_ret = op_ret;
+            gf_msg_debug(this->name, op_errno,
+                         "Unlink link: subvolume %s"
+                         " returned -1",
+                         prev->name);
+        } else {
+            local->op_ret = 0;
+        }
+    }
+    UNLOCK(&frame->lock);
+
+    if (local->op_ret == -1) {
+        DHT_STACK_UNWIND(unlink, frame, -1, local->op_errno, NULL, NULL, NULL);
+        return 0;
+    }
+
+    DHT_STACK_UNWIND(unlink, frame, local->op_ret, local->op_errno,
+                     &local->preparent, &local->postparent, xdata);
+
+    return 0;
+}
+
+/*
+ * Callback for the unlink wound by tier_unlink() to the cached subvolume.
+ *
+ * After the data file is gone (ENOENT included), the other tier is cleaned:
+ *  - file was cached on the hot tier: unlink the linkfile on the cold tier;
+ *  - file was cached on the cold tier and the returned iatt shows a
+ *    migration in progress: look the file up on the hot tier so that
+ *    tier_unlink_lookup_cbk can remove the migration destination.
+ * In every other case the unlink fop is unwound right away.
+ */
+int32_t
+tier_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                int op_errno, struct iatt *preparent, struct iatt *postparent,
+                dict_t *xdata)
+{
+    dht_local_t *local = frame->local;
+    dht_conf_t *conf = this->private;
+    xlator_t *prev = cookie;
+    xlator_t *cold_tier = TIER_HASHED_SUBVOL;
+    xlator_t *hot_tier = TIER_UNHASHED_SUBVOL;
+    struct iatt *stbuf = NULL;
+    int ret = -1;
+
+    LOCK(&frame->lock);
+    {
+        if (op_ret == -1) {
+            /* A file that is already gone is as good as unlinked. */
+            if (op_errno == ENOENT) {
+                local->op_ret = 0;
+            } else {
+                local->op_ret = -1;
+                local->op_errno = op_errno;
+            }
+            gf_msg_debug(this->name, op_errno,
+                         "Unlink: subvolume %s returned -1"
+                         " with errno = %d",
+                         prev->name, op_errno);
+        } else {
+            local->op_ret = 0;
+
+            local->postparent = *postparent;
+            local->preparent = *preparent;
+
+            if (local->loc.parent) {
+                dht_inode_ctx_time_update(local->loc.parent, this,
+                                          &local->preparent, 0);
+                dht_inode_ctx_time_update(local->loc.parent, this,
+                                          &local->postparent, 1);
+            }
+        }
+    }
+    UNLOCK(&frame->lock);
+
+    if (local->op_ret)
+        goto out;
+
+    if (cold_tier != local->cached_subvol) {
+        /*
+         * File is present in hot tier, so there will be
+         * a link file on cold tier, deleting the linkfile
+         * from cold tier
+         */
+        STACK_WIND_COOKIE(frame, tier_unlink_linkfile_cbk, cold_tier, cold_tier,
+                          cold_tier->fops->unlink, &local->loc, local->flags,
+                          xdata);
+        return 0;
+    }
+
+    /* tier_unlink asked for the iatt via DHT_IATT_IN_XDATA_KEY. */
+    ret = dict_get_bin(xdata, DHT_IATT_IN_XDATA_KEY, (void **)&stbuf);
+    if ((ret == 0) && stbuf &&
+        (IS_DHT_MIGRATION_PHASE2(stbuf) || IS_DHT_MIGRATION_PHASE1(stbuf))) {
+        /*
+         * File is migrating from cold to hot tier.
+         * Delete the destination linkfile.
+         */
+        STACK_WIND_COOKIE(frame, tier_unlink_lookup_cbk, hot_tier, hot_tier,
+                          hot_tier->fops->lookup, &local->loc, NULL);
+        return 0;
+    }
+
+out:
+    DHT_STACK_UNWIND(unlink, frame, local->op_ret, local->op_errno,
+                     &local->preparent, &local->postparent, xdata);
+
+    return 0;
+}
+
+/*
+ * unlink fop for the tier translator.
+ *
+ * The unlink is wound to the file's cached subvolume; tier_unlink_cbk then
+ * cleans up whatever is left on the other tier. For a regular file cached on
+ * TIER_HASHED_SUBVOL (the cold tier), DHT_IATT_IN_XDATA_KEY is set in the
+ * request xdata so the callback can tell whether the file is being promoted.
+ *
+ * The caller's reference on xdata is never consumed: only a reference taken
+ * (or a dict created) inside this function is dropped again.
+ *
+ * Always returns 0; failures are reported by unwinding with op_ret -1.
+ */
+int
+tier_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+            dict_t *xdata)
+{
+    xlator_t *cached_subvol = NULL;
+    xlator_t *hashed_subvol = NULL;
+    dht_conf_t *conf = NULL;
+    int op_errno = -1;
+    dht_local_t *local = NULL;
+    dict_t *req_xdata = NULL; /* reference owned by this function, if any */
+    int ret = -1;
+
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(loc, err);
+
+    conf = this->private;
+
+    local = dht_local_init(frame, loc, NULL, GF_FOP_UNLINK);
+    if (!local) {
+        op_errno = ENOMEM;
+
+        goto err;
+    }
+
+    hashed_subvol = TIER_HASHED_SUBVOL;
+
+    cached_subvol = local->cached_subvol;
+    if (!cached_subvol) {
+        gf_msg_debug(this->name, 0, "no cached subvolume for path=%s",
+                     loc->path);
+        op_errno = EINVAL;
+        goto err;
+    }
+
+    local->flags = xflag;
+    if (IA_ISREG(loc->inode->ia_type) && (hashed_subvol == cached_subvol)) {
+        /*
+         * File resides in cold tier. We need to stat
+         * the file to see if it is being promoted.
+         * If yes we need to delete the destination
+         * file as well.
+         *
+         * Currently we are doing this check only for
+         * regular files.
+         */
+        req_xdata = xdata ? dict_ref(xdata) : dict_new();
+        if (req_xdata) {
+            ret = dict_set_int8(req_xdata, DHT_IATT_IN_XDATA_KEY, 1);
+            if (ret) {
+                gf_msg_debug(this->name, 0, "Failed to set dictionary key %s",
+                             DHT_IATT_IN_XDATA_KEY);
+            }
+        }
+        /* If dict_new() failed, the unlink is wound without xdata. */
+        xdata = req_xdata;
+    }
+
+    /*
+     * Delete the data file from the cached subvolume first;
+     * tier_unlink_cbk takes care of the other tier afterwards.
+     */
+    STACK_WIND_COOKIE(frame, tier_unlink_cbk, cached_subvol, cached_subvol,
+                      cached_subvol->fops->unlink, loc, xflag, xdata);
+
+    /*
+     * Drop only the reference taken above. Unreferencing xdata
+     * unconditionally would release the caller's reference whenever the
+     * branch above was not taken.
+     */
+    if (req_xdata)
+        dict_unref(req_xdata);
+    return 0;
+err:
+    op_errno = (op_errno == -1) ? errno : op_errno;
+    DHT_STACK_UNWIND(unlink, frame, -1, op_errno, NULL, NULL, NULL);
+
+    return 0;
+}
+
+/*
+ * readdir callback: copies the name, offset, inode number, type and length
+ * of every returned entry into a fresh list and unwinds with that list.
+ * A negative op_ret is unwound as 0 entries (op_errno is passed through).
+ */
+int
+tier_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                 int op_errno, gf_dirent_t *orig_entries, dict_t *xdata)
+{
+    gf_dirent_t entries;
+    gf_dirent_t *orig_entry = NULL;
+    gf_dirent_t *copy = NULL;
+    int copied = 0;
+
+    INIT_LIST_HEAD(&entries.list);
+
+    if (op_ret >= 0) {
+        list_for_each_entry(orig_entry, (&orig_entries->list), list)
+        {
+            copy = gf_dirent_for_name(orig_entry->d_name);
+            if (copy == NULL) {
+                gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+                       "Memory allocation failed ");
+                goto unwind;
+            }
+
+            copy->d_off = orig_entry->d_off;
+            copy->d_ino = orig_entry->d_ino;
+            copy->d_type = orig_entry->d_type;
+            copy->d_len = orig_entry->d_len;
+
+            list_add_tail(&copy->list, &entries.list);
+            copied++;
+        }
+        op_ret = copied;
+    }
+
+unwind:
+    if (op_ret < 0)
+        op_ret = 0;
+
+    DHT_STACK_UNWIND(readdir, frame, op_ret, op_errno, &entries, NULL);
+
+    gf_dirent_free(&entries);
+
+    return 0;
+}
+
+/*
+ * readdirp callback: copies every entry with a valid stat into a fresh list,
+ * updates the inode ctx/layout of the entries that are not linkfiles, and
+ * unwinds with that list. If no entry survived but the subvolume has not hit
+ * EOF, readdirp is wound again to the same subvolume from the last offset.
+ * A negative op_ret is unwound as 0 entries (op_errno is passed through).
+ */
+int
+tier_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                  int op_errno, gf_dirent_t *orig_entries, dict_t *xdata)
+{
+    dht_local_t *local = frame->local;
+    dht_conf_t *conf = this->private;
+    xlator_t *prev = cookie;
+    inode_table_t *itable = NULL;
+    inode_t *inode = NULL;
+    gf_dirent_t entries;
+    gf_dirent_t *orig_entry = NULL;
+    gf_dirent_t *entry = NULL;
+    off_t next_offset = 0;
+    int count = 0;
+    int ret = 0;
+
+    INIT_LIST_HEAD(&entries.list);
+    itable = local->fd ? local->fd->inode->table : NULL;
+
+    GF_VALIDATE_OR_GOTO(this->name, conf, unwind);
+
+    if (op_ret < 0)
+        goto done;
+
+    list_for_each_entry(orig_entry, (&orig_entries->list), list)
+    {
+        next_offset = orig_entry->d_off;
+
+        /* stat failed somewhere - ignore this entry */
+        if (IA_ISINVAL(orig_entry->d_stat.ia_type))
+            continue;
+
+        entry = gf_dirent_for_name(orig_entry->d_name);
+        if (entry == NULL)
+            goto unwind;
+
+        entry->d_off = orig_entry->d_off;
+        entry->d_stat = orig_entry->d_stat;
+        entry->d_ino = orig_entry->d_ino;
+        entry->d_type = orig_entry->d_type;
+        entry->d_len = orig_entry->d_len;
+
+        if (orig_entry->dict)
+            entry->dict = dict_ref(orig_entry->dict);
+
+        /* Linkfiles are passed on as they are, without touching inodes. */
+        if (!check_is_linkfile(NULL, (&orig_entry->d_stat), orig_entry->dict,
+                               conf->link_xattr_name)) {
+            if (IA_ISDIR(entry->d_stat.ia_type)) {
+                if (orig_entry->inode) {
+                    dht_inode_ctx_time_update(orig_entry->inode, this,
+                                              &entry->d_stat, 1);
+                }
+            } else if (orig_entry->inode) {
+                ret = dht_layout_preset(this, prev, orig_entry->inode);
+                if (ret)
+                    gf_msg(this->name, GF_LOG_WARNING, 0,
+                           DHT_MSG_LAYOUT_SET_FAILED,
+                           "failed to link the layout "
+                           "in inode");
+
+                entry->inode = inode_ref(orig_entry->inode);
+            } else if (itable) {
+                /*
+                 * orig_entry->inode might be null if an xlator
+                 * between this one and the client set it to null
+                 * to force a lookup on the inode, even if the
+                 * inode is present in the inode table. In that
+                 * case we just update the ctx to make sure we
+                 * didn't miss anything.
+                 */
+                inode = inode_find(itable, orig_entry->d_stat.ia_gfid);
+                if (inode) {
+                    ret = dht_layout_preset(this, TIER_HASHED_SUBVOL, inode);
+                    if (ret)
+                        gf_msg(this->name, GF_LOG_WARNING, 0,
+                               DHT_MSG_LAYOUT_SET_FAILED,
+                               "failed to link the layout"
+                               " in inode");
+                    inode_unref(inode);
+                    inode = NULL;
+                }
+            }
+        }
+
+        list_add_tail(&entry->list, &entries.list);
+        count++;
+    }
+    op_ret = count;
+
+done:
+    /*
+     * Nothing to hand back, but a non-zero next_offset means that EOF is
+     * not yet hit on the current subvol: read on from there.
+     */
+    if ((count == 0) && (next_offset != 0)) {
+        STACK_WIND_COOKIE(frame, tier_readdirp_cbk, prev, prev,
+                          prev->fops->readdirp, local->fd, local->size,
+                          next_offset, local->xattr);
+        return 0;
+    }
+
+unwind:
+    if (op_ret < 0)
+        op_ret = 0;
+
+    DHT_STACK_UNWIND(readdirp, frame, op_ret, op_errno, &entries, NULL);
+
+    gf_dirent_free(&entries);
+
+    return 0;
+}
+/* Shared readdir/readdirp path: winds the request to TIER_HASHED_SUBVOL only. */
+int
+tier_do_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t yoff, int whichop, dict_t *dict)
+{
+ dht_local_t *local = NULL;
+ int op_errno = -1;
+ xlator_t *hashed_subvol = NULL;
+ int ret = 0;
+ dht_conf_t *conf = NULL;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
+ VALIDATE_OR_GOTO(this->private, err);
+
+ conf = this->private;
+
+ local = dht_local_init(frame, NULL, NULL, whichop);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->fd = fd_ref(fd);
+ local->size = size;
+ local->xattr_req = (dict) ? dict_ref(dict) : NULL;
+
+ hashed_subvol = TIER_HASHED_SUBVOL;
+
+ /* TODO: do proper readdir */
+ if (whichop == GF_FOP_READDIRP) {
+ if (dict)
+ local->xattr = dict_ref(dict); /* same dict object, so the key set below lands in it */
+ else
+ local->xattr = dict_new();
+
+ if (local->xattr) {
+ ret = dict_set_uint32(local->xattr, conf->link_xattr_name, 256);
+ if (ret)
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary value"
+ " : key = %s",
+ conf->link_xattr_name);
+ }
+
+ STACK_WIND_COOKIE(frame, tier_readdirp_cbk, hashed_subvol,
+ hashed_subvol, hashed_subvol->fops->readdirp, fd,
+ size, yoff, local->xattr);
+
+ } else {
+ STACK_WIND_COOKIE(frame, tier_readdir_cbk, hashed_subvol, hashed_subvol,
+ hashed_subvol->fops->readdir, fd, size, yoff,
+ local->xattr); /* not assigned by this function on the readdir branch */
+ }
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno; /* -1 means no explicit error was chosen above */
+ DHT_STACK_UNWIND(readdir, frame, -1, op_errno, NULL, NULL); /* readdir unwind is used for both ops */
+
+ return 0;
+}
+
+/* readdir fop: picks READDIR or READDIRP from the configuration, then delegates. */
+int
+tier_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+             off_t yoff, dict_t *xdata)
+{
+    dht_conf_t *conf = this->private;
+    int whichop = GF_FOP_READDIR;
+    int idx = 0;
+
+    if (conf) {
+        /* use_readdirp forces the upgrade; otherwise the first zero
+         * entry in subvolume_status triggers it. */
+        if (conf->use_readdirp)
+            whichop = GF_FOP_READDIRP;
+
+        for (idx = 0;
+             whichop == GF_FOP_READDIR && idx < conf->subvolume_cnt; idx++) {
+            if (!conf->subvolume_status[idx])
+                whichop = GF_FOP_READDIRP;
+        }
+    }
+
+    /* xdata is not forwarded; the helper is given a null dict. */
+    tier_do_readdir(frame, this, fd, size, yoff, whichop, 0);
+    return 0;
+}
+/* readdirp fop entry point: forwards to tier_do_readdir() with GF_FOP_READDIRP and the caller's dict. */
+int
+tier_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t yoff, dict_t *dict)
+{
+ tier_do_readdir(frame, this, fd, size, yoff, GF_FOP_READDIRP, dict);
+ return 0;
+}
+
+/* statfs callback, invoked once per subvolume. Merges the reply into
+ * local->statvfs / local->tier_statvfs under frame->lock; the last reply
+ * recomputes the free counters and unwinds with local->op_ret/op_errno. */
+int
+tier_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                int op_errno, struct statvfs *statvfs, dict_t *xdata)
+{
+    gf_boolean_t event = _gf_false;
+    qdstatfs_action_t action = qdstatfs_action_OFF;
+    dht_local_t *local = NULL;
+    int this_call_cnt = 0;
+    int bsize = 0;
+    int frsize = 0;
+    int8_t deem_statfs = 0;
+    unsigned long new_usage = 0;
+    unsigned long cur_usage = 0;
+    xlator_t *prev = NULL;
+    dht_conf_t *conf = NULL;
+    tier_statvfs_t *tier_stat = NULL;
+
+    prev = cookie;
+    local = frame->local;
+    GF_ASSERT(local);
+
+    conf = this->private;
+
+    /* Fetch into a real int8_t instead of writing through a cast enum. */
+    if (xdata && dict_get_int8(xdata, "quota-deem-statfs", &deem_statfs) == 0)
+        event = deem_statfs ? _gf_true : _gf_false;
+
+    tier_stat = &local->tier_statvfs;
+
+    LOCK(&frame->lock);
+    {
+        if (op_ret == -1) {
+            local->op_errno = op_errno;
+            goto unlock;
+        }
+        if (!statvfs) {
+            /* the final unwind reports local->op_errno, so set it there */
+            local->op_errno = EINVAL;
+            local->op_ret = -1;
+            goto unlock;
+        }
+        local->op_ret = 0;
+
+        if (local->quota_deem_statfs) {
+            action = (event == _gf_true) ? qdstatfs_action_COMPARE
+                                         : qdstatfs_action_NEGLECT;
+        } else if (event == _gf_true) {
+            action = qdstatfs_action_REPLACE;
+            local->quota_deem_statfs = _gf_true;
+        }
+
+        if (local->quota_deem_statfs) {
+            switch (action) {
+                case qdstatfs_action_NEGLECT:
+                    goto unlock;
+
+                case qdstatfs_action_REPLACE:
+                    local->statvfs = *statvfs;
+                    goto unlock;
+
+                case qdstatfs_action_COMPARE:
+                    new_usage = statvfs->f_blocks - statvfs->f_bfree;
+                    cur_usage = local->statvfs.f_blocks -
+                                local->statvfs.f_bfree;
+
+                    /* Take the max of the usage from subvols */
+                    if (new_usage >= cur_usage)
+                        local->statvfs = *statvfs;
+                    goto unlock;
+
+                default:
+                    break;
+            }
+        }
+
+        if (local->statvfs.f_bsize != 0) {
+            bsize = max(local->statvfs.f_bsize, statvfs->f_bsize);
+            frsize = max(local->statvfs.f_frsize, statvfs->f_frsize);
+            dht_normalize_stats(&local->statvfs, bsize, frsize);
+            dht_normalize_stats(statvfs, bsize, frsize);
+        } else {
+            local->statvfs.f_bsize = statvfs->f_bsize;
+            local->statvfs.f_frsize = statvfs->f_frsize;
+        }
+
+        if (prev == TIER_HASHED_SUBVOL) {
+            local->statvfs.f_blocks = statvfs->f_blocks;
+            local->statvfs.f_files = statvfs->f_files;
+            local->statvfs.f_fsid = statvfs->f_fsid;
+            local->statvfs.f_flag = statvfs->f_flag;
+            local->statvfs.f_namemax = statvfs->f_namemax;
+            tier_stat->blocks_used = (statvfs->f_blocks - statvfs->f_bfree);
+            tier_stat->pblocks_used = (statvfs->f_blocks - statvfs->f_bavail);
+            tier_stat->files_used = (statvfs->f_files - statvfs->f_ffree);
+            tier_stat->pfiles_used = (statvfs->f_files - statvfs->f_favail);
+            tier_stat->hashed_fsid = statvfs->f_fsid;
+        } else {
+            tier_stat->unhashed_fsid = statvfs->f_fsid;
+            tier_stat->unhashed_blocks_used = (statvfs->f_blocks -
+                                               statvfs->f_bfree);
+            tier_stat->unhashed_pblocks_used = (statvfs->f_blocks -
+                                                statvfs->f_bavail);
+            tier_stat->unhashed_files_used = (statvfs->f_files -
+                                              statvfs->f_ffree);
+            tier_stat->unhashed_pfiles_used = (statvfs->f_files -
+                                               statvfs->f_favail);
+        }
+    }
+unlock:
+    UNLOCK(&frame->lock);
+
+    this_call_cnt = dht_frame_return(frame);
+    if (is_last_call(this_call_cnt)) {
+        if (tier_stat->unhashed_fsid != tier_stat->hashed_fsid) {
+            tier_stat->blocks_used += tier_stat->unhashed_blocks_used;
+            tier_stat->pblocks_used += tier_stat->unhashed_pblocks_used;
+            tier_stat->files_used += tier_stat->unhashed_files_used;
+            tier_stat->pfiles_used += tier_stat->unhashed_pfiles_used;
+        }
+        local->statvfs.f_bfree = local->statvfs.f_blocks -
+                                 tier_stat->blocks_used;
+        local->statvfs.f_bavail = local->statvfs.f_blocks -
+                                  tier_stat->pblocks_used;
+        local->statvfs.f_ffree = local->statvfs.f_files - tier_stat->files_used;
+        local->statvfs.f_favail = local->statvfs.f_files -
+                                  tier_stat->pfiles_used;
+        DHT_STACK_UNWIND(statfs, frame, local->op_ret, local->op_errno,
+                         &local->statvfs, xdata);
+    }
+
+    return 0;
+}
+/* statfs fop: winds statfs to every subvolume; a non-directory loc is swapped for a root loc. */
+int
+tier_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ int op_errno = -1;
+ int i = -1;
+ inode_t *inode = NULL;
+ inode_table_t *itable = NULL;
+ uuid_t root_gfid = {
+ 0,
+ };
+ loc_t newloc = {
+ 0,
+ };
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+ VALIDATE_OR_GOTO(this->private, err);
+
+ conf = this->private;
+
+ local = dht_local_init(frame, NULL, NULL, GF_FOP_STATFS);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ if (loc->inode && !IA_ISDIR(loc->inode->ia_type)) {
+ itable = loc->inode->table;
+ if (!itable) {
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ loc = &local->loc2; /* NOTE(review): overwritten with &newloc below before any use */
+ root_gfid[15] = 1; /* gfid 00..01 is looked up as the root inode */
+
+ inode = inode_find(itable, root_gfid);
+ if (!inode) {
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ dht_build_root_loc(inode, &newloc); /* NOTE(review): nothing here releases the inode_find() ref - confirm ownership */
+ loc = &newloc;
+ }
+
+ local->call_cnt = conf->subvolume_cnt; /* one tier_statfs_cbk reply is expected per subvolume */
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ STACK_WIND_COOKIE(frame, tier_statfs_cbk, conf->subvolumes[i],
+ conf->subvolumes[i],
+ conf->subvolumes[i]->fops->statfs, loc, xdata);
+ }
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno; /* -1 means no explicit error was chosen above */
+ DHT_STACK_UNWIND(statfs, frame, -1, op_errno, NULL, NULL);
+
+ return 0;
+}
diff --git a/xlators/cluster/dht/src/tier-common.h b/xlators/cluster/dht/src/tier-common.h
new file mode 100644
index 0000000..b1ebaa8
--- /dev/null
+++ b/xlators/cluster/dht/src/tier-common.h
@@ -0,0 +1,55 @@
+/*
+ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _TIER_COMMON_H_
+#define _TIER_COMMON_H_
+/* Function declarations */
+int
+tier_create_unlink_stale_linkto_cbk(call_frame_t *frame, void *cookie,
+ xlator_t *this, int op_ret, int op_errno,
+ struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata);
+
+int
+tier_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, fd_t *fd, inode_t *inode, struct iatt *stbuf,
+ struct iatt *preparent, struct iatt *postparent, dict_t *xdata);
+
+int
+tier_create_linkfile_create_cbk(call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret,
+ int32_t op_errno, inode_t *inode,
+ struct iatt *stbuf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata);
+
+int
+tier_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ mode_t mode, mode_t umask, fd_t *fd, dict_t *params);
+
+int32_t
+tier_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+ dict_t *xdata);
+
+int32_t
+tier_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t off, dict_t *dict);
+
+int
+tier_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t yoff, dict_t *xdata);
+
+int
+tier_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata);
+
+int
+tier_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata);
+
+#endif
diff --git a/xlators/cluster/dht/src/tier.c b/xlators/cluster/dht/src/tier.c
new file mode 100644
index 0000000..94b4c63
--- /dev/null
+++ b/xlators/cluster/dht/src/tier.c
@@ -0,0 +1,3105 @@
+/*
+ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include <dlfcn.h>
+
+#include "dht-common.h"
+#include "tier.h"
+#include "tier-common.h"
+#include <glusterfs/syscall.h>
+#include <glusterfs/events.h>
+#include "tier-ctr-interface.h"
+
+/*Hard coded DB info*/
+static gfdb_db_type_t dht_tier_db_type = GFDB_SQLITE3;
+/*Hard coded DB info*/
+
+/*Mutex for updating the data movement stats*/
+static pthread_mutex_t dm_stat_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+/* Stores the path location of promotion query files */
+static char *promotion_qfile;
+/* Stores the path location of demotion query files */
+static char *demotion_qfile;
+
+static void *libhandle;
+static gfdb_methods_t gfdb_methods;
+
+#define DB_QUERY_RECORD_SIZE 4096
+
+/*
+ * Closes every fd that is not -1, then frees fd_array and the qfile_array itself
+ * */
+static void
+qfile_array_free(tier_qfile_array_t *qfile_array)
+{
+ ssize_t i = 0;
+
+ if (qfile_array) {
+ if (qfile_array->fd_array) {
+ for (i = 0; i < qfile_array->array_size; i++) {
+ if (qfile_array->fd_array[i] != -1) { /* -1 marks a closed or never-opened slot */
+ sys_close(qfile_array->fd_array[i]);
+ }
+ }
+ }
+ GF_FREE(qfile_array->fd_array);
+ }
+ GF_FREE(qfile_array);
+}
+
+/* Creates a query file list of array_size slots, all -1 and counted as exhausted; NULL on failure. */
+static tier_qfile_array_t *
+qfile_array_new(ssize_t array_size)
+{
+ int ret = -1;
+ tier_qfile_array_t *qfile_array = NULL;
+ ssize_t i = 0;
+
+ GF_VALIDATE_OR_GOTO("tier", (array_size > 0), out);
+
+ qfile_array = GF_CALLOC(1, sizeof(tier_qfile_array_t),
+ gf_tier_mt_qfile_array_t);
+ if (!qfile_array) {
+ gf_msg("tier", GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+ "Failed to allocate memory for tier_qfile_array_t");
+ goto out;
+ }
+
+ qfile_array->fd_array = GF_MALLOC(array_size * sizeof(int),
+ gf_dht_mt_int32_t);
+ if (!qfile_array->fd_array) {
+ gf_msg("tier", GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+ "Failed to allocate memory for "
+ "tier_qfile_array_t->fd_array");
+ goto out;
+ }
+
+ /* Init all the fds to -1 */
+ for (i = 0; i < array_size; i++) {
+ qfile_array->fd_array[i] = -1;
+ }
+
+ qfile_array->array_size = array_size;
+ qfile_array->next_index = 0;
+
+ /* Set exhausted count to list size as the list is empty */
+ qfile_array->exhausted_count = qfile_array->array_size;
+
+ ret = 0;
+out:
+ if (ret) { /* any failure above: release the partially built list */
+ qfile_array_free(qfile_array);
+ qfile_array = NULL;
+ }
+ return qfile_array;
+}
+
+/* True when exhausted_count equals array_size, i.e. no query file is left to read. */
+static gf_boolean_t
+is_qfile_array_empty(tier_qfile_array_t *qfile_array)
+{
+    if (qfile_array->exhausted_count == qfile_array->array_size)
+        return _gf_true;
+    return _gf_false;
+}
+
+/* Advances next_index, wrapping at the end, until it lands on an open fd or a full lap is done. */
+static void
+shift_next_index(tier_qfile_array_t *qfile_array)
+{
+    int tries = 0;
+
+    /* Nothing to advance to when every slot is exhausted. */
+    if (is_qfile_array_empty(qfile_array))
+        return;
+
+    for (tries = 1;; tries++) {
+        /* step to the following slot in a rotational manner */
+        if (qfile_array->next_index == (qfile_array->array_size - 1))
+            qfile_array->next_index = 0;
+        else
+            qfile_array->next_index++;
+
+        /* stop on an open fd, or after visiting array_size slots */
+        if (qfile_array->fd_array[qfile_array->next_index] != -1 ||
+            tries >= qfile_array->array_size)
+            break;
+    }
+}
+
+/*
+ * This is a non-thread safe function to read query records
+ * from a list of query files in a Round-Robin manner.
+ * As and when the query files get exhausted they are closed.
+ * Returns:
+ * 0 if all the query records in all the query files of the list are
+ * exhausted.
+ * > 0 if a query record is successfully read. Indicates the size of the query
+ * record read.
+ * < 0 if qfile_array or its fd_array is NULL (a failed read is treated as EOF)
+ * */
+static int
+read_query_record_list(tier_qfile_array_t *qfile_array,
+ gfdb_query_record_t **query_record)
+{
+ int ret = -1;
+ int qfile_fd = 0;
+
+ GF_VALIDATE_OR_GOTO("tier", qfile_array, out);
+ GF_VALIDATE_OR_GOTO("tier", qfile_array->fd_array, out);
+
+ do {
+ if (is_qfile_array_empty(qfile_array)) {
+ ret = 0;
+ break;
+ }
+
+ qfile_fd = qfile_array->fd_array[qfile_array->next_index];
+ ret = gfdb_methods.gfdb_read_query_record(qfile_fd, query_record);
+ if (ret <= 0) {
+ /* The qfile_fd has reached EOF or
+ * there was an error.
+ * 1. Close the exhausted fd
+ * 2. increment the exhausted count
+ * 3. shift next_qfile to next qfile
+ **/
+ sys_close(qfile_fd);
+ qfile_array->fd_array[qfile_array->next_index] = -1;
+ qfile_array->exhausted_count++;
+ /* shift next_qfile to next qfile */
+ shift_next_index(qfile_array);
+ continue; /* try the next file; ret is overwritten on the next pass */
+ } else {
+ /* shift next_qfile to next qfile */
+ shift_next_index(qfile_array);
+ break;
+ }
+ } while (1);
+out:
+ return ret;
+}
+
+/* Check and update the watermark every WM_INTERVAL seconds */
+#define WM_INTERVAL 5
+#define WM_INTERVAL_EMERG 1
+/* Compares loc's GF_XATTR_NODE_UUID_KEY with defrag->node_uuid: 0 same, 1 different, -1 error. */
+static int
+tier_check_same_node(xlator_t *this, loc_t *loc, gf_defrag_info_t *defrag)
+{
+ int ret = -1;
+ dict_t *dict = NULL;
+ char *uuid_str = NULL;
+ uuid_t node_uuid = {
+ 0,
+ };
+
+ GF_VALIDATE_OR_GOTO("tier", this, out);
+ GF_VALIDATE_OR_GOTO(this->name, loc, out);
+ GF_VALIDATE_OR_GOTO(this->name, defrag, out);
+
+ if (syncop_getxattr(this, loc, &dict, GF_XATTR_NODE_UUID_KEY, NULL, NULL)) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+ "Unable to get NODE_UUID_KEY %s %s\n", loc->name, loc->path);
+ goto out;
+ }
+
+ if (dict_get_str(dict, GF_XATTR_NODE_UUID_KEY, &uuid_str) < 0) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+ "Failed to get node-uuids for %s", loc->path);
+ goto out;
+ }
+
+ if (gf_uuid_parse(uuid_str, node_uuid)) { /* uuid_str is only used while dict is still referenced */
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+ "uuid_parse failed for %s", loc->path);
+ goto out;
+ }
+
+ if (gf_uuid_compare(node_uuid, defrag->node_uuid)) {
+ gf_msg_debug(this->name, 0, "%s does not belong to this node",
+ loc->path);
+ ret = 1; /* not an error: the caller skips files of other nodes */
+ goto out;
+ }
+
+ ret = 0;
+out:
+ if (dict)
+ dict_unref(dict);
+
+ return ret;
+}
+
+/* Refreshes the usage figures in defrag->tier_conf from a statfs of
+ * conf->subvolumes[1]. Returns 0 on success, non-zero on failure. */
+int
+tier_get_fs_stat(xlator_t *this, loc_t *root_loc)
+{
+    int ret = 0;
+    gf_defrag_info_t *defrag = NULL;
+    dht_conf_t *conf = NULL;
+    dict_t *xdata = NULL;
+    struct statvfs statfs = {0};
+    gf_tier_conf_t *tier_conf = NULL;
+
+    conf = this->private;
+    if (!conf) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_STATUS,
+               "conf is NULL");
+        ret = -1;
+        goto exit;
+    }
+
+    defrag = conf->defrag;
+    if (!defrag) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_STATUS,
+               "defrag is NULL");
+        ret = -1;
+        goto exit;
+    }
+
+    tier_conf = &defrag->tier_conf;
+
+    xdata = dict_new();
+    if (!xdata) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+               "failed to allocate dictionary");
+        ret = -1;
+        goto exit;
+    }
+
+    ret = dict_set_int8(xdata, GF_INTERNAL_IGNORE_DEEM_STATFS, 1);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+               "Failed to set " GF_INTERNAL_IGNORE_DEEM_STATFS " in dict");
+        ret = -1;
+        goto exit;
+    }
+
+    /* Find how much space is used on the hot subvolume and stash the
+     * results in the tier_conf data structure; tier_check_watermark()
+     * compares percent_full with the user defined watermarks. */
+
+    ret = syncop_statfs(conf->subvolumes[1], root_loc, &statfs, xdata, NULL);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_LOG_TIER_STATUS,
+               "Unable to obtain statfs.");
+        goto exit;
+    }
+
+    pthread_mutex_lock(&dm_stat_mutex);
+    tier_conf->block_size = statfs.f_bsize;
+    tier_conf->blocks_total = statfs.f_blocks;
+    tier_conf->blocks_used = statfs.f_blocks - statfs.f_bfree;
+
+    /* GF_PERCENTAGE is expected to divide by the total; leave
+     * percent_full as it was when the subvolume reports zero blocks. */
+    if (statfs.f_blocks)
+        tier_conf->percent_full = GF_PERCENTAGE(tier_conf->blocks_used,
+                                                statfs.f_blocks);
+    pthread_mutex_unlock(&dm_stat_mutex);
+
+exit:
+    if (xdata)
+        dict_unref(xdata);
+    return ret;
+}
+
+/* Emits the tier watermark event matching the old_wm -> new_wm transition, if any. */
+static void
+tier_send_watermark_event(const char *volname, tier_watermark_op_t old_wm,
+                          tier_watermark_op_t new_wm)
+{
+    /* NONE is treated like LOW as a starting point. */
+    int from_low = (old_wm == TIER_WM_LOW || old_wm == TIER_WM_NONE);
+
+    if (new_wm == TIER_WM_HI) {
+        if (from_low || old_wm == TIER_WM_MID) {
+            gf_event(EVENT_TIER_WATERMARK_HI, "vol=%s", volname);
+        }
+    } else if (new_wm == TIER_WM_MID) {
+        if (from_low) {
+            gf_event(EVENT_TIER_WATERMARK_RAISED_TO_MID, "vol=%s", volname);
+        } else if (old_wm == TIER_WM_HI) {
+            gf_event(EVENT_TIER_WATERMARK_DROPPED_TO_MID, "vol=%s", volname);
+        }
+    } else if (new_wm == TIER_WM_LOW) {
+        if (old_wm == TIER_WM_MID || old_wm == TIER_WM_HI) {
+            gf_event(EVENT_TIER_WATERMARK_DROPPED_TO_LOW, "vol=%s", volname);
+        }
+    }
+}
+/* Classifies percent_full against the low/hi watermarks and records a change; 0, or -1 without conf/defrag. */
+int
+tier_check_watermark(xlator_t *this)
+{
+ int ret = -1;
+ gf_defrag_info_t *defrag = NULL;
+ dht_conf_t *conf = NULL;
+ gf_tier_conf_t *tier_conf = NULL;
+ tier_watermark_op_t wm = TIER_WM_NONE;
+
+ conf = this->private;
+ if (!conf)
+ goto exit;
+
+ defrag = conf->defrag;
+ if (!defrag)
+ goto exit;
+
+ tier_conf = &defrag->tier_conf;
+
+ if (tier_conf->percent_full < tier_conf->watermark_low) {
+ wm = TIER_WM_LOW;
+
+ } else if (tier_conf->percent_full < tier_conf->watermark_hi) {
+ wm = TIER_WM_MID;
+
+ } else {
+ wm = TIER_WM_HI;
+ }
+
+ if (wm != tier_conf->watermark_last) { /* only a change is reported and stored */
+ tier_send_watermark_event(tier_conf->volname, tier_conf->watermark_last,
+ wm);
+
+ tier_conf->watermark_last = wm;
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
+ "Tier watermark now %d", wm);
+ }
+
+ ret = 0;
+
+exit:
+ return ret;
+}
+
+/* True only in watermark mode when the last recorded watermark is HI. */
+static gf_boolean_t
+is_hot_tier_full(gf_tier_conf_t *tier_conf)
+{
+    if (!tier_conf || tier_conf->mode != TIER_MODE_WM)
+        return _gf_false;
+
+    return (tier_conf->watermark_last == TIER_WM_HI) ? _gf_true : _gf_false;
+}
+/* Decides whether to migrate now (1) or not (0); between the watermarks the choice is randomized. */
+int
+tier_do_migration(xlator_t *this, int promote)
+{
+ gf_defrag_info_t *defrag = NULL;
+ dht_conf_t *conf = NULL;
+ long rand = 0;
+ int migrate = 0;
+ gf_tier_conf_t *tier_conf = NULL;
+
+ conf = this->private;
+ if (!conf)
+ goto exit;
+
+ defrag = conf->defrag;
+ if (!defrag)
+ goto exit;
+
+ if (tier_check_watermark(this) != 0) {
+ gf_msg(this->name, GF_LOG_CRITICAL, errno, DHT_MSG_LOG_TIER_ERROR,
+ "Failed to get watermark");
+ goto exit;
+ }
+
+ tier_conf = &defrag->tier_conf;
+
+ switch (tier_conf->watermark_last) { /* any other value leaves migrate at 0 */
+ case TIER_WM_LOW:
+ migrate = promote ? 1 : 0;
+ break;
+ case TIER_WM_HI:
+ migrate = promote ? 0 : 1;
+ break;
+ case TIER_WM_MID:
+ /* coverity[DC.WEAK_CRYPTO] */
+ rand = random() % 100; /* 0..99, compared with percent_full below */
+ if (promote) {
+ migrate = (rand > tier_conf->percent_full);
+ } else {
+ migrate = (rand <= tier_conf->percent_full);
+ }
+ break;
+ }
+
+exit:
+ return migrate;
+}
+/* Runs one migration via syncop_setxattr(), flagging promote/demote_in_progress around the call. */
+int
+tier_migrate(xlator_t *this, int is_promotion, dict_t *migrate_data, loc_t *loc,
+ gf_tier_conf_t *tier_conf)
+{
+ int ret = -1;
+
+ pthread_mutex_lock(&tier_conf->pause_mutex);
+ if (is_promotion)
+ tier_conf->promote_in_progress = 1;
+ else
+ tier_conf->demote_in_progress = 1;
+ pthread_mutex_unlock(&tier_conf->pause_mutex);
+
+ /* Data migration; pause_mutex is not held while it runs */
+ ret = syncop_setxattr(this, loc, migrate_data, 0, NULL, NULL);
+
+ pthread_mutex_lock(&tier_conf->pause_mutex);
+ if (is_promotion)
+ tier_conf->promote_in_progress = 0;
+ else
+ tier_conf->demote_in_progress = 0;
+ pthread_mutex_unlock(&tier_conf->pause_mutex);
+
+ return ret; /* the syncop_setxattr() result, unchanged */
+}
+
+/* returns _gf_true: if file can be promoted
+ * returns _gf_false: if it exceeds tier_max_promote_size or would reach the hi watermark
+ */
+static gf_boolean_t
+tier_can_promote_file(xlator_t *this, char const *file_name,
+ struct iatt *current, gf_defrag_info_t *defrag)
+{
+ gf_boolean_t ret = _gf_false;
+ fsblkcnt_t estimated_usage = 0;
+
+ if (defrag->tier_conf.tier_max_promote_size && /* zero disables the size limit */
+ (current->ia_size > defrag->tier_conf.tier_max_promote_size)) {
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
+ "File %s (gfid:%s) with size (%" PRIu64
+ ") exceeds maxsize "
+ "(%d) for promotion. File will not be promoted.",
+ file_name, uuid_utoa(current->ia_gfid), current->ia_size,
+ defrag->tier_conf.tier_max_promote_size);
+ goto err;
+ }
+
+ /* bypass further validations for TEST mode */
+ if (defrag->tier_conf.mode != TIER_MODE_WM) {
+ ret = _gf_true;
+ goto err;
+ }
+
+ /* convert the file size to blocks as per the block size of the
+ * destination tier
+ * NOTE: add (block_size - 1) to get the correct block size when
+ * there is a remainder after a modulo
+ */
+ estimated_usage = ((current->ia_size + defrag->tier_conf.block_size - 1) /
+ defrag->tier_conf.block_size) + /* NOTE(review): confirm block_size is non-zero here */
+ defrag->tier_conf.blocks_used;
+
+ /* test if the estimated block usage goes above HI watermark */
+ if (GF_PERCENTAGE(estimated_usage, defrag->tier_conf.blocks_total) >=
+ defrag->tier_conf.watermark_hi) {
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
+ "Estimated block count consumption on "
+ "hot tier (%" PRIu64
+ ") exceeds hi watermark (%d%%). "
+ "File will not be promoted.",
+ estimated_usage, defrag->tier_conf.watermark_hi);
+ goto err;
+ }
+ ret = _gf_true;
+err:
+ return ret; /* label is shared by the accept and reject paths */
+}
+
+/*
+ * Populates migrate_data with the three keys that mark a forced migration
+ * issued by the tier migrator. Returns 0 on success, otherwise the
+ * non-zero result of the dict_set_str() call that failed.
+ */
+static int
+tier_set_migrate_data(dict_t *migrate_data)
+{
+    int rc = 0;
+
+    /* Ask for a forced file migration */
+    rc = dict_set_str(migrate_data, GF_XATTR_FILE_MIGRATE_KEY, "force");
+    if (rc)
+        return rc;
+
+    /* Flag to suggest the xattr call is from migrator */
+    rc = dict_set_str(migrate_data, "from.migrator", "yes");
+    if (rc)
+        return rc;
+
+    /* Flag to suggest it is a tiering migration.
+     * The reason for this dict key-value is that
+     * promotions and demotions are multithreaded
+     * so the original frame from gf_defrag_start()
+     * is not carried. A new frame will be created when
+     * we do syncop_setxattr(). This does not have the
+     * frame->root->pid of the original frame. So we pass
+     * this dict key-value when we do syncop_setxattr() to do
+     * data migration and set the frame->root->pid to
+     * GF_CLIENT_PID_TIER_DEFRAG in dht_setxattr() just before
+     * calling dht_start_rebalance_task() */
+    rc = dict_set_str(migrate_data, TIERING_MIGRATION_KEY, "yes");
+    if (rc)
+        return rc;
+
+    return 0;
+}
+
+/*
+ * Looks up p_loc with GET_ANCESTRY_PATH_KEY in the request and returns the
+ * path found in the reply as a private copy that the caller must GF_FREE().
+ * Returns NULL on failure; *per_link_status is set to 1 for a stale parent
+ * and to -1 for other lookup or copy failures.
+ */
+static char *
+tier_get_parent_path(xlator_t *this, loc_t *p_loc, struct iatt *par_stbuf,
+                     int *per_link_status)
+{
+    int ret = -1;
+    char *parent_path = NULL;
+    char *ancestry = NULL; /* borrowed from xdata_response */
+    dict_t *xdata_request = NULL;
+    dict_t *xdata_response = NULL;
+
+    xdata_request = dict_new();
+    if (!xdata_request) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+               "Failed to create xdata_request dict");
+        goto err;
+    }
+    ret = dict_set_int32(xdata_request, GET_ANCESTRY_PATH_KEY, 42);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+               "Failed to set value to dict : key %s \n",
+               GET_ANCESTRY_PATH_KEY);
+        goto err;
+    }
+
+    ret = syncop_lookup(this, p_loc, par_stbuf, NULL, xdata_request,
+                        &xdata_response);
+    /* When the parent gfid is a stale entry, the lookup
+     * will fail and stop the demotion process.
+     * The parent gfid can be stale when a huge folder is
+     * deleted while the files within it are being migrated
+     */
+    if (ret == -ESTALE) {
+        gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_STALE_LOOKUP,
+               "Stale entry in parent lookup for %s", uuid_utoa(p_loc->gfid));
+        *per_link_status = 1;
+        goto err;
+    } else if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_LOG_TIER_ERROR,
+               "Error in parent lookup for %s", uuid_utoa(p_loc->gfid));
+        *per_link_status = -1;
+        goto err;
+    }
+    ret = dict_get_str(xdata_response, GET_ANCESTRY_PATH_KEY, &ancestry);
+    if (ret || !ancestry) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+               "Failed to get parent path for %s", uuid_utoa(p_loc->gfid));
+        *per_link_status = -1;
+        goto err;
+    }
+
+    /* The string is owned by xdata_response, which is released below, so
+     * the caller gets its own copy. */
+    parent_path = gf_strdup(ancestry);
+    if (!parent_path) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+               "Failed to copy parent path for %s", uuid_utoa(p_loc->gfid));
+        *per_link_status = -1;
+    }
+
+err:
+    if (xdata_request)
+        dict_unref(xdata_request);
+    if (xdata_response)
+        dict_unref(xdata_response);
+
+    return parent_path;
+}
+
+/* Fills loc->name with a copy of link_info->file_name and loc->path with
+ * "<parent_path>/<name>". Returns 0 on success; on failure returns a
+ * non-zero value and sets *per_link_status to -1. */
+static int
+tier_get_file_name_and_path(xlator_t *this, uuid_t gfid,
+                            gfdb_link_info_t *link_info,
+                            char const *parent_path, loc_t *loc,
+                            int *per_link_status)
+{
+    int ret = -1;
+
+    loc->name = gf_strdup(link_info->file_name);
+    if (!loc->name) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+               "Memory allocation failed for %s", uuid_utoa(gfid));
+        *per_link_status = -1;
+        goto err;
+    }
+    ret = gf_asprintf((char **)&(loc->path), "%s/%s", parent_path, loc->name);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+               "Failed to construct file path for %s %s\n", parent_path,
+               loc->name);
+        *per_link_status = -1;
+        goto err;
+    }
+
+    ret = 0;
+
+err:
+    return ret;
+}
+
+/* Looks up loc into *current. Returns 0 on success; otherwise the syncop
+ * error, with *per_link_status = 1 for -ESTALE (skip) and -1 for the rest. */
+static int
+tier_lookup_file(xlator_t *this, loc_t *p_loc, loc_t *loc, struct iatt *current,
+                 int *per_link_status)
+{
+    int ret = -1;
+
+    ret = syncop_lookup(this, loc, current, NULL, NULL, NULL);
+
+    /* The file may be deleted even when the parent
+     * is available and the lookup will
+     * return a stale entry which would stop the
+     * migration. so if its a stale entry, then skip
+     * the file and keep migrating.
+     */
+    if (ret == -ESTALE) {
+        gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_STALE_LOOKUP,
+               "Stale lookup for %s", uuid_utoa(p_loc->gfid));
+        *per_link_status = 1;
+        goto err;
+    } else if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_LOG_TIER_ERROR,
+               "Failed to lookup file %s\n", loc->name);
+        *per_link_status = -1;
+        goto err;
+    }
+    ret = 0;
+
+err:
+    return ret;
+}
+
+/* True (with *per_link_status = 1) when src_subvol is unknown or is already the target tier. */
+static gf_boolean_t
+tier_is_file_already_at_destination(xlator_t *src_subvol,
+                                    query_cbk_args_t *query_cbk_args,
+                                    dht_conf_t *conf, int *per_link_status)
+{
+    gf_boolean_t at_destination = _gf_true;
+
+    if (src_subvol == NULL) {
+        *per_link_status = 1;
+        goto err;
+    }
+    if (query_cbk_args->is_promotion && src_subvol == conf->subvolumes[1]) {
+        *per_link_status = 1;
+        goto err;
+    }
+    if (!query_cbk_args->is_promotion && src_subvol == conf->subvolumes[0]) {
+        *per_link_status = 1;
+        goto err;
+    }
+    at_destination = _gf_false;
+
+err:
+    return at_destination;
+}
+
+/* Accounts one migrated file in the defrag stats, the per-cycle totals and the cached tier usage. */
+static void
+tier_update_migration_counters(query_cbk_args_t *query_cbk_args,
+                               gf_defrag_info_t *defrag,
+                               uint64_t *total_migrated_bytes, int *total_files)
+{
+    if (query_cbk_args->is_promotion) {
+        defrag->total_files_promoted++;
+        *total_migrated_bytes += defrag->tier_conf.st_last_promoted_size;
+        pthread_mutex_lock(&dm_stat_mutex);
+        defrag->tier_conf.blocks_used += defrag->tier_conf
+                                             .st_last_promoted_size;
+        pthread_mutex_unlock(&dm_stat_mutex);
+    } else {
+        defrag->total_files_demoted++;
+        *total_migrated_bytes += defrag->tier_conf.st_last_demoted_size;
+        pthread_mutex_lock(&dm_stat_mutex);
+        defrag->tier_conf.blocks_used -= defrag->tier_conf.st_last_demoted_size;
+        pthread_mutex_unlock(&dm_stat_mutex);
+    }
+    if (defrag->tier_conf.blocks_total) {
+        pthread_mutex_lock(&dm_stat_mutex);
+        defrag->tier_conf.percent_full = GF_PERCENTAGE(
+            defrag->tier_conf.blocks_used, defrag->tier_conf.blocks_total);
+        pthread_mutex_unlock(&dm_stat_mutex);
+    }
+    (*total_files)++;
+}
+
+/* Resolves one hard link of gfid to a path, checks eligibility and migrates
+ * it. *per_link_status: -1 failure, 1 skipped. The return value is forced
+ * to -1 once the per-cycle file or byte limit is reached. */
+static int
+tier_migrate_link(xlator_t *this, dht_conf_t *conf, uuid_t gfid,
+                  gfdb_link_info_t *link_info, gf_defrag_info_t *defrag,
+                  query_cbk_args_t *query_cbk_args, dict_t *migrate_data,
+                  int *per_link_status, int *total_files,
+                  uint64_t *total_migrated_bytes)
+{
+    int ret = -1;
+    struct iatt current = {0};
+    struct iatt par_stbuf = {0};
+    loc_t p_loc = {0};
+    loc_t loc = {0};
+    xlator_t *src_subvol = NULL;
+    inode_t *linked_inode = NULL;
+    char *parent_path = NULL; /* owned copy, freed at err */
+
+    /* Lookup for parent and get the path of parent */
+    gf_uuid_copy(p_loc.gfid, link_info->pargfid);
+    p_loc.inode = inode_new(defrag->root_inode->table);
+    if (!p_loc.inode) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+               "Failed to create reference to inode for %s",
+               uuid_utoa(p_loc.gfid));
+        *per_link_status = -1;
+        goto err;
+    }
+
+    parent_path = tier_get_parent_path(this, &p_loc, &par_stbuf,
+                                       per_link_status);
+    if (!parent_path)
+        goto err;
+
+    linked_inode = inode_link(p_loc.inode, NULL, NULL, &par_stbuf);
+    inode_unref(p_loc.inode);
+    p_loc.inode = linked_inode;
+
+    /* Preparing File Inode */
+    gf_uuid_copy(loc.gfid, gfid);
+    loc.inode = inode_new(defrag->root_inode->table);
+    if (!loc.inode) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+               "Failed to create reference to inode for %s", uuid_utoa(gfid));
+        *per_link_status = -1;
+        goto err;
+    }
+    gf_uuid_copy(loc.pargfid, link_info->pargfid);
+    loc.parent = inode_ref(p_loc.inode);
+
+    /* Get filename and Construct file path */
+    if (tier_get_file_name_and_path(this, gfid, link_info, parent_path, &loc,
+                                    per_link_status) != 0) {
+        goto err;
+    }
+    gf_uuid_copy(loc.parent->gfid, link_info->pargfid);
+
+    /* lookup file inode */
+    if (tier_lookup_file(this, &p_loc, &loc, &current, per_link_status) != 0)
+        goto err;
+
+    if (query_cbk_args->is_promotion) {
+        if (!tier_can_promote_file(this, link_info->file_name, &current,
+                                   defrag)) {
+            *per_link_status = 1;
+            goto err;
+        }
+    }
+
+    linked_inode = inode_link(loc.inode, NULL, NULL, &current);
+    inode_unref(loc.inode);
+    loc.inode = linked_inode;
+
+    /* Do not promote/demote if file already is where it should be. It means
+     * another brick moved the file so is not an error. So we set
+     * per_link_status = 1 so that we ignore counting this. */
+    src_subvol = dht_subvol_get_cached(this, loc.inode);
+    if (tier_is_file_already_at_destination(src_subvol, query_cbk_args, conf,
+                                            per_link_status)) {
+        goto err;
+    }
+
+    gf_msg_debug(this->name, 0, "Tier %s: src_subvol %s file %s",
+                 (query_cbk_args->is_promotion ? "promote" : "demote"),
+                 src_subvol->name, loc.path);
+
+    ret = tier_check_same_node(this, &loc, defrag);
+    if (ret != 0) {
+        if (ret < 0) {
+            *per_link_status = -1;
+            goto err;
+        }
+        ret = 0;
+        /* By setting per_link_status to 1 we are ignoring this status
+         * and will not be counting this file for migration */
+        *per_link_status = 1;
+        goto err;
+    }
+
+    gf_uuid_copy(loc.gfid, loc.inode->gfid);
+
+    if (gf_defrag_get_pause_state(&defrag->tier_conf) != TIER_RUNNING) {
+        gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
+               "Tiering paused. Exiting tier_migrate_link");
+        goto err;
+    }
+
+    ret = tier_migrate(this, query_cbk_args->is_promotion, migrate_data, &loc,
+                       &defrag->tier_conf);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_LOG_TIER_ERROR,
+               "Failed to migrate %s ", loc.path);
+        *per_link_status = -1;
+        goto err;
+    }
+
+    tier_update_migration_counters(query_cbk_args, defrag, total_migrated_bytes,
+                                   total_files);
+    ret = 0;
+
+err:
+    GF_FREE(parent_path);
+    GF_FREE((char *)loc.name);
+    loc.name = NULL;
+    loc_wipe(&loc);
+    loc_wipe(&p_loc);
+
+    if ((*total_files >= defrag->tier_conf.max_migrate_files) ||
+        (*total_migrated_bytes > defrag->tier_conf.max_migrate_bytes)) {
+        gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
+               "Reached cycle migration limit."
+               "migrated bytes %" PRIu64 " files %d",
+               *total_migrated_bytes, *total_files);
+        ret = -1;
+    }
+
+    return ret;
+}
+
+/*
+ * Read records from the query-file array and try to migrate one link per
+ * record.
+ *
+ * _args is a query_cbk_args_t whose this, defrag and qfile_array members
+ * must be set (this->private must be set as well).
+ *
+ * The loop ends when read_query_record_list() returns 0, when reading fails,
+ * when defrag is no longer in GF_DEFRAG_STATUS_STARTED, when the cycle has
+ * run longer than the configured promote/demote frequency, when tiering is
+ * paused, or when the watermark checks ask us to stop.
+ *
+ * Returns the accumulated per-file status: 0 when no file failed, otherwise
+ * the negated number of failed files. Validation and setup failures also
+ * return 0 because total_status is not modified on those paths.
+ */
+static int
+tier_migrate_using_query_file(void *_args)
+{
+    int ret = -1;
+    query_cbk_args_t *query_cbk_args = (query_cbk_args_t *)_args;
+    xlator_t *this = NULL;
+    gf_defrag_info_t *defrag = NULL;
+    gfdb_query_record_t *query_record = NULL;
+    gfdb_link_info_t *link_info = NULL;
+    dict_t *migrate_data = NULL;
+    /*
+     * per_file_status and per_link_status
+     *  0  : success
+     * -1  : failure
+     *  1  : ignore the status and don't count for migration
+     * */
+    int per_file_status = 0;
+    int per_link_status = 0;
+    int total_status = 0;
+    dht_conf_t *conf = NULL;
+    uint64_t total_migrated_bytes = 0;
+    int total_files = 0;
+    loc_t root_loc = {0};
+    gfdb_time_t start_time = {0};
+    gfdb_time_t current_time = {0};
+    int total_time = 0;
+    int max_time = 0;
+    gf_boolean_t emergency_demote_mode = _gf_false;
+
+    GF_VALIDATE_OR_GOTO("tier", query_cbk_args, out);
+    GF_VALIDATE_OR_GOTO("tier", query_cbk_args->this, out);
+    this = query_cbk_args->this;
+    GF_VALIDATE_OR_GOTO(this->name, query_cbk_args->defrag, out);
+    GF_VALIDATE_OR_GOTO(this->name, query_cbk_args->qfile_array, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    conf = this->private;
+
+    defrag = query_cbk_args->defrag;
+    migrate_data = dict_new();
+    if (!migrate_data)
+        goto out;
+
+    /* Demotion that starts while the hot tier is full is an "emergency"
+     * demotion; it is allowed to stop early once the tier is no longer
+     * full (checked at the bottom of the loop). */
+    emergency_demote_mode = (!query_cbk_args->is_promotion &&
+                             is_hot_tier_full(&defrag->tier_conf));
+
+    if (tier_set_migrate_data(migrate_data) != 0) {
+        goto out;
+    }
+
+    dht_build_root_loc(defrag->root_inode, &root_loc);
+
+    ret = gettimeofday(&start_time, NULL);
+    if (query_cbk_args->is_promotion) {
+        max_time = defrag->tier_conf.tier_promote_frequency;
+    } else {
+        max_time = defrag->tier_conf.tier_demote_frequency;
+    }
+
+    /* Per file */
+    while ((ret = read_query_record_list(query_cbk_args->qfile_array,
+                                         &query_record)) != 0) {
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+                   "Failed to fetch query record "
+                   "from query file");
+            goto out;
+        }
+
+        if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) {
+            ret = -1;
+            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+                   "Exiting tier migration as"
+                   "defrag status is not started");
+            goto out;
+        }
+
+        ret = gettimeofday(&current_time, NULL);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+                   "Could not get current time.");
+            goto out;
+        }
+
+        /* Bound the whole cycle by the promote/demote frequency. */
+        total_time = current_time.tv_sec - start_time.tv_sec;
+        if (total_time > max_time) {
+            gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
+                   "Max cycle time reached. Exiting migration.");
+            goto out;
+        }
+
+        per_file_status = 0;
+        per_link_status = 0;
+
+        if (gf_defrag_get_pause_state(&defrag->tier_conf) != TIER_RUNNING) {
+            gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
+                   "Tiering paused. "
+                   "Exiting tier_migrate_using_query_file");
+            break;
+        }
+
+        if (defrag->tier_conf.mode == TIER_MODE_WM) {
+            ret = tier_get_fs_stat(this, &root_loc);
+            if (ret != 0) {
+                gfdb_methods.gfdb_query_record_free(query_record);
+                query_record = NULL;
+                gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_STATUS,
+                       "tier_get_fs_stat() FAILED ... "
+                       "skipping file migrations until next cycle");
+                break;
+            }
+
+            if (!tier_do_migration(this, query_cbk_args->is_promotion)) {
+                gfdb_methods.gfdb_query_record_free(query_record);
+                query_record = NULL;
+
+                /* We have crossed the high watermark. Stop processing
+                 * files if this is a promotion cycle so demotion gets
+                 * a chance to start if not already running*/
+
+                if (query_cbk_args->is_promotion &&
+                    is_hot_tier_full(&defrag->tier_conf)) {
+                    gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
+                           "High watermark crossed during "
+                           "promotion. Exiting "
+                           "tier_migrate_using_query_file");
+                    break;
+                }
+                continue;
+            }
+        }
+
+        per_link_status = 0;
+
+        /* Start every record with no link selected. Without this reset a
+         * record with an empty link list would reuse the pointer taken
+         * from the previous record, which has already been handed to
+         * gfdb_query_record_free(). */
+        link_info = NULL;
+
+        /* For now we only support single link migration. And we will
+         * ignore other hard links in the link info list of query record
+         * TODO: Multiple hard links migration */
+        if (!list_empty(&query_record->link_list)) {
+            link_info = list_first_entry(&query_record->link_list,
+                                         gfdb_link_info_t, list);
+        }
+        if (link_info != NULL) {
+            if (tier_migrate_link(this, conf, query_record->gfid, link_info,
+                                  defrag, query_cbk_args, migrate_data,
+                                  &per_link_status, &total_files,
+                                  &total_migrated_bytes) != 0) {
+                gf_msg(
+                    this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
+                    "%s failed for %s(gfid:%s)",
+                    (query_cbk_args->is_promotion ? "Promotion" : "Demotion"),
+                    link_info->file_name, uuid_utoa(query_record->gfid));
+            }
+        }
+        per_file_status = per_link_status;
+
+        if (per_file_status < 0) { /* Failure */
+            pthread_mutex_lock(&dm_stat_mutex);
+            defrag->total_failures++;
+            pthread_mutex_unlock(&dm_stat_mutex);
+        } else if (per_file_status == 0) { /* Success */
+            pthread_mutex_lock(&dm_stat_mutex);
+            defrag->total_files++;
+            pthread_mutex_unlock(&dm_stat_mutex);
+        } else if (per_file_status == 1) { /* Ignore */
+            per_file_status = 0;
+            /* Since this attempt was ignored we
+             * decrement the lookup count*/
+            pthread_mutex_lock(&dm_stat_mutex);
+            defrag->num_files_lookedup--;
+            pthread_mutex_unlock(&dm_stat_mutex);
+        }
+        total_status = total_status + per_file_status;
+        per_link_status = 0;
+        per_file_status = 0;
+
+        gfdb_methods.gfdb_query_record_free(query_record);
+        query_record = NULL;
+
+        /* If we are demoting and the entry watermark was HI, then
+         * we are done with emergency demotions if the current
+         * watermark has fallen below hi-watermark level
+         */
+        if (emergency_demote_mode) {
+            if (tier_check_watermark(this) == 0) {
+                if (!is_hot_tier_full(&defrag->tier_conf)) {
+                    break;
+                }
+            }
+        }
+    }
+
+out:
+    if (migrate_data)
+        dict_unref(migrate_data);
+
+    /* Releases a record left over from a break/goto in the loop. */
+    gfdb_methods.gfdb_query_record_free(query_record);
+    query_record = NULL;
+
+    return total_status;
+}
+
+/*
+ * Per-record callback handed to the gfdb find_* queries.
+ *
+ * Appends gfdb_query_record to the query file referenced by
+ * query_cbk_args->query_fd and, on success, increments
+ * defrag->num_files_lookedup under dm_stat_mutex.
+ *
+ * Returns 0 on success, -1 on invalid arguments, or the non-zero result of
+ * gfdb_write_query_record() when the write fails.
+ */
+static int
+tier_gf_query_callback(gfdb_query_record_t *gfdb_query_record, void *_args)
+{
+    int ret = -1;
+    query_cbk_args_t *query_cbk_args = _args;
+
+    GF_VALIDATE_OR_GOTO("tier", query_cbk_args, out);
+    GF_VALIDATE_OR_GOTO("tier", query_cbk_args->defrag, out);
+    /* 0 is a valid descriptor; only negative values mean "not open". This
+     * matches how tier_process_self_query() tests the same field. */
+    GF_VALIDATE_OR_GOTO("tier", (query_cbk_args->query_fd >= 0), out);
+
+    ret = gfdb_methods.gfdb_write_query_record(query_cbk_args->query_fd,
+                                               gfdb_query_record);
+    if (ret) {
+        gf_msg("tier", GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+               "Failed writing query record to query file");
+        goto out;
+    }
+
+    pthread_mutex_lock(&dm_stat_mutex);
+    query_cbk_args->defrag->num_files_lookedup++;
+    pthread_mutex_unlock(&dm_stat_mutex);
+
+    ret = 0;
+out:
+    return ret;
+}
+
+/*
+ * Create the query file from within the tier process.
+ *
+ * Opens a gfdb connection to local_brick->brick_db_path, opens (creating and
+ * appending to) local_brick->qfile_path, runs the find_* query selected by
+ * the promote/demote flag, the watermark state and the frequency thresholds
+ * with tier_gf_query_callback() as the per-record callback, and finally
+ * sends a GFDB_IPC_CTR_CLEAR_OPS IPC to the brick.
+ *
+ * args is a gfdb_brick_info_t. Returns 0 on success and non-zero on failure.
+ */
+static int
+tier_process_self_query(tier_brick_list_t *local_brick, void *args)
+{
+    int ret = -1;
+    char *db_path = NULL;
+    query_cbk_args_t *query_cbk_args = NULL;
+    xlator_t *this = NULL;
+    gfdb_conn_node_t *conn_node = NULL;
+    dict_t *params_dict = NULL;
+    dict_t *ctr_ipc_dict = NULL;
+    gfdb_brick_info_t *gfdb_brick_info = args;
+
+    /*Init of all the essentials*/
+    GF_VALIDATE_OR_GOTO("tier", gfdb_brick_info, out);
+    query_cbk_args = gfdb_brick_info->_query_cbk_args;
+
+    /* Validate the pointer before the first dereference. */
+    GF_VALIDATE_OR_GOTO("tier", query_cbk_args, out);
+
+    /* This function owns query_fd for its whole duration. Mark it closed
+     * up front so the cleanup path never closes a stale or uninitialised
+     * descriptor value when we bail out before open(). */
+    query_cbk_args->query_fd = -1;
+
+    GF_VALIDATE_OR_GOTO("tier", query_cbk_args->this, out);
+    this = query_cbk_args->this;
+
+    GF_VALIDATE_OR_GOTO(this->name, local_brick, out);
+
+    GF_VALIDATE_OR_GOTO(this->name, local_brick->xlator, out);
+
+    GF_VALIDATE_OR_GOTO(this->name, local_brick->brick_db_path, out);
+
+    db_path = local_brick->brick_db_path;
+
+    /*Preparing DB parameters before init_db i.e getting db connection*/
+    params_dict = dict_new();
+    if (!params_dict) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+               "DB Params cannot initialized");
+        goto out;
+    }
+    SET_DB_PARAM_TO_DICT(this->name, params_dict,
+                         (char *)gfdb_methods.get_db_path_key(), db_path, ret,
+                         out);
+
+    /* The failure paths below do not assign ret themselves; make sure they
+     * report an error regardless of what the macro above left in it. */
+    ret = -1;
+
+    /*Get the db connection*/
+    conn_node = gfdb_methods.init_db((void *)params_dict, dht_tier_db_type);
+    if (!conn_node) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+               "FATAL: Failed initializing db operations");
+        goto out;
+    }
+
+    /* Query for eligible files from db */
+    query_cbk_args->query_fd = open(local_brick->qfile_path,
+                                    O_WRONLY | O_CREAT | O_APPEND,
+                                    S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+    if (query_cbk_args->query_fd < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, DHT_MSG_LOG_TIER_ERROR,
+               "Failed to open query file %s", local_brick->qfile_path);
+        goto out;
+    }
+    if (!gfdb_brick_info->_gfdb_promote) {
+        if (query_cbk_args->defrag->tier_conf.watermark_last == TIER_WM_HI) {
+            /* emergency demotion mode */
+            ret = gfdb_methods.find_all(
+                conn_node, tier_gf_query_callback, (void *)query_cbk_args,
+                query_cbk_args->defrag->tier_conf.query_limit);
+        } else {
+            if (query_cbk_args->defrag->write_freq_threshold == 0 &&
+                query_cbk_args->defrag->read_freq_threshold == 0) {
+                ret = gfdb_methods.find_unchanged_for_time(
+                    conn_node, tier_gf_query_callback, (void *)query_cbk_args,
+                    gfdb_brick_info->time_stamp);
+            } else {
+                ret = gfdb_methods.find_unchanged_for_time_freq(
+                    conn_node, tier_gf_query_callback, (void *)query_cbk_args,
+                    gfdb_brick_info->time_stamp,
+                    query_cbk_args->defrag->write_freq_threshold,
+                    query_cbk_args->defrag->read_freq_threshold, _gf_false);
+            }
+        }
+    } else {
+        if (query_cbk_args->defrag->write_freq_threshold == 0 &&
+            query_cbk_args->defrag->read_freq_threshold == 0) {
+            ret = gfdb_methods.find_recently_changed_files(
+                conn_node, tier_gf_query_callback, (void *)query_cbk_args,
+                gfdb_brick_info->time_stamp);
+        } else {
+            ret = gfdb_methods.find_recently_changed_files_freq(
+                conn_node, tier_gf_query_callback, (void *)query_cbk_args,
+                gfdb_brick_info->time_stamp,
+                query_cbk_args->defrag->write_freq_threshold,
+                query_cbk_args->defrag->read_freq_threshold, _gf_false);
+        }
+    }
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+               "FATAL: query from db failed");
+        goto out;
+    }
+
+    /* ret is 0 here; switch back to "failed" so that an allocation or
+     * dict failure below is not reported as success. */
+    ret = -1;
+
+    /*Clear the heat on the DB entries*/
+    /*Preparing ctr_ipc_dict*/
+    ctr_ipc_dict = dict_new();
+    if (!ctr_ipc_dict) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+               "ctr_ipc_dict cannot initialized");
+        goto out;
+    }
+
+    SET_DB_PARAM_TO_DICT(this->name, ctr_ipc_dict, GFDB_IPC_CTR_KEY,
+                         GFDB_IPC_CTR_CLEAR_OPS, ret, out);
+
+    ret = syncop_ipc(local_brick->xlator, GF_IPC_TARGET_CTR, ctr_ipc_dict,
+                     NULL);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+               "Failed clearing the heat "
+               "on db %s error %d",
+               local_brick->brick_db_path, ret);
+        goto out;
+    }
+
+    ret = 0;
+out:
+    if (params_dict) {
+        dict_unref(params_dict);
+        params_dict = NULL;
+    }
+
+    if (ctr_ipc_dict) {
+        dict_unref(ctr_ipc_dict);
+        ctr_ipc_dict = NULL;
+    }
+
+    if (query_cbk_args && query_cbk_args->query_fd >= 0) {
+        sys_close(query_cbk_args->query_fd);
+        query_cbk_args->query_fd = -1;
+    }
+    gfdb_methods.fini_db(conn_node);
+
+    return ret;
+}
+
+/*
+ * Ask CTR on the brick to create the query file.
+ *
+ * Builds an IPC request dictionary carrying GFDB_IPC_CTR_QUERY_OPS, the
+ * query file path and a gfdb_ipc_ctr_params_t with the query parameters,
+ * sends it with syncop_ipc() and reads GFDB_IPC_CTR_RET_QUERY_COUNT from the
+ * reply. A non-negative count is stored in defrag->num_files_lookedup.
+ *
+ * args is a gfdb_brick_info_t. Returns 0 on success and non-zero on failure.
+ */
+static int
+tier_process_ctr_query(tier_brick_list_t *local_brick, void *args)
+{
+    int ret = -1;
+    query_cbk_args_t *query_cbk_args = NULL;
+    xlator_t *this = NULL;
+    dict_t *ctr_ipc_in_dict = NULL;
+    dict_t *ctr_ipc_out_dict = NULL;
+    gfdb_brick_info_t *gfdb_brick_info = args;
+    gfdb_ipc_ctr_params_t *ipc_ctr_params = NULL;
+    int count = 0;
+
+    /*Init of all the essentials*/
+    GF_VALIDATE_OR_GOTO("tier", gfdb_brick_info, out);
+    query_cbk_args = gfdb_brick_info->_query_cbk_args;
+
+    /* Validate the pointer before the first dereference. */
+    GF_VALIDATE_OR_GOTO("tier", query_cbk_args, out);
+
+    GF_VALIDATE_OR_GOTO("tier", query_cbk_args->this, out);
+    this = query_cbk_args->this;
+
+    GF_VALIDATE_OR_GOTO(this->name, local_brick, out);
+
+    GF_VALIDATE_OR_GOTO(this->name, local_brick->xlator, out);
+
+    GF_VALIDATE_OR_GOTO(this->name, local_brick->brick_db_path, out);
+
+    /*Preparing ctr_ipc_in_dict*/
+    ctr_ipc_in_dict = dict_new();
+    if (!ctr_ipc_in_dict) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+               "ctr_ipc_in_dict cannot initialized");
+        goto out;
+    }
+
+    ipc_ctr_params = GF_CALLOC(1, sizeof(gfdb_ipc_ctr_params_t),
+                               gf_tier_mt_ipc_ctr_params_t);
+    if (!ipc_ctr_params) {
+        goto out;
+    }
+
+    /* set all the query params*/
+    ipc_ctr_params->is_promote = gfdb_brick_info->_gfdb_promote;
+
+    ipc_ctr_params->write_freq_threshold = query_cbk_args->defrag
+                                               ->write_freq_threshold;
+
+    ipc_ctr_params->read_freq_threshold = query_cbk_args->defrag
+                                              ->read_freq_threshold;
+
+    ipc_ctr_params->query_limit = query_cbk_args->defrag->tier_conf.query_limit;
+
+    /* Demotion while the last seen watermark is HI is flagged as an
+     * emergency demotion. */
+    ipc_ctr_params->emergency_demote = (!gfdb_brick_info->_gfdb_promote &&
+                                        query_cbk_args->defrag->tier_conf
+                                                .watermark_last == TIER_WM_HI);
+
+    memcpy(&ipc_ctr_params->time_stamp, gfdb_brick_info->time_stamp,
+           sizeof(gfdb_time_t));
+
+    SET_DB_PARAM_TO_DICT(this->name, ctr_ipc_in_dict, GFDB_IPC_CTR_KEY,
+                         GFDB_IPC_CTR_QUERY_OPS, ret, out);
+
+    SET_DB_PARAM_TO_DICT(this->name, ctr_ipc_in_dict,
+                         GFDB_IPC_CTR_GET_QFILE_PATH, local_brick->qfile_path,
+                         ret, out);
+
+    ret = dict_set_bin(ctr_ipc_in_dict, GFDB_IPC_CTR_GET_QUERY_PARAMS,
+                       ipc_ctr_params, sizeof(*ipc_ctr_params));
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, LG_MSG_SET_PARAM_FAILED,
+               "Failed setting %s to params dictionary",
+               GFDB_IPC_CTR_GET_QUERY_PARAMS);
+        /* ipc_ctr_params is still ours; it is released exactly once by
+         * the GF_FREE() in the cleanup path below. */
+        goto out;
+    }
+    /* Stored in the dictionary now; clear the local pointer so the cleanup
+     * path does not free it. */
+    ipc_ctr_params = NULL;
+
+    ret = syncop_ipc(local_brick->xlator, GF_IPC_TARGET_CTR, ctr_ipc_in_dict,
+                     &ctr_ipc_out_dict);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_IPC_TIER_ERROR,
+               "Failed query on %s ret %d", local_brick->brick_db_path, ret);
+        goto out;
+    }
+
+    ret = dict_get_int32(ctr_ipc_out_dict, GFDB_IPC_CTR_RET_QUERY_COUNT,
+                         &count);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+               "Failed getting count "
+               "of records on %s",
+               local_brick->brick_db_path);
+        goto out;
+    }
+
+    if (count < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+               "Failed query on %s", local_brick->brick_db_path);
+        ret = -1;
+        goto out;
+    }
+
+    /* NOTE(review): this assigns rather than accumulates, whereas
+     * tier_gf_query_callback() increments the same counter per record;
+     * confirm that overwriting is intended when several bricks are
+     * queried in one cycle. */
+    pthread_mutex_lock(&dm_stat_mutex);
+    query_cbk_args->defrag->num_files_lookedup = count;
+    pthread_mutex_unlock(&dm_stat_mutex);
+
+    ret = 0;
+out:
+
+    if (ctr_ipc_in_dict) {
+        dict_unref(ctr_ipc_in_dict);
+        ctr_ipc_in_dict = NULL;
+    }
+
+    if (ctr_ipc_out_dict) {
+        dict_unref(ctr_ipc_out_dict);
+        ctr_ipc_out_dict = NULL;
+    }
+
+    GF_FREE(ipc_ctr_params);
+
+    return ret;
+}
+
+/* This is the call back function for each brick from hot/cold bricklist
+ * It picks up each bricks db and queries for eligible files for migration.
+ * The list of eligible files are populated in appropriate query files
+ *
+ * For an SQLite3 database the brick is first asked (via a CTR IPC) for its
+ * "journal_mode". When the reply starts with "wal" the query is run from the
+ * tier process (tier_process_self_query); otherwise CTR is asked to run it
+ * (tier_process_ctr_query). Any other database type always uses
+ * tier_process_self_query.
+ *
+ * Returns 0 on success and non-zero on failure. */
+static int
+tier_process_brick(tier_brick_list_t *local_brick, void *args)
+{
+    int ret = -1;
+    dict_t *ctr_ipc_in_dict = NULL;
+    dict_t *ctr_ipc_out_dict = NULL;
+    char *strval = NULL;
+
+    GF_VALIDATE_OR_GOTO("tier", local_brick, out);
+
+    GF_VALIDATE_OR_GOTO("tier", local_brick->xlator, out);
+
+    if (dht_tier_db_type == GFDB_SQLITE3) {
+        /*Preparing ctr_ipc_in_dict*/
+        ctr_ipc_in_dict = dict_new();
+        if (!ctr_ipc_in_dict) {
+            gf_msg("tier", GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+                   "ctr_ipc_in_dict cannot initialized");
+            goto out;
+        }
+
+        ret = dict_set_str(ctr_ipc_in_dict, GFDB_IPC_CTR_KEY,
+                           GFDB_IPC_CTR_GET_DB_PARAM_OPS);
+        if (ret) {
+            gf_msg("tier", GF_LOG_ERROR, 0, LG_MSG_SET_PARAM_FAILED,
+                   "Failed to set %s "
+                   "to params dictionary",
+                   GFDB_IPC_CTR_KEY);
+            goto out;
+        }
+
+        ret = dict_set_str(ctr_ipc_in_dict, GFDB_IPC_CTR_GET_DB_PARAM_OPS, "");
+        if (ret) {
+            gf_msg("tier", GF_LOG_ERROR, 0, LG_MSG_SET_PARAM_FAILED,
+                   "Failed to set %s "
+                   "to params dictionary",
+                   GFDB_IPC_CTR_GET_DB_PARAM_OPS);
+            goto out;
+        }
+
+        ret = dict_set_str(ctr_ipc_in_dict, GFDB_IPC_CTR_GET_DB_KEY,
+                           "journal_mode");
+        if (ret) {
+            gf_msg("tier", GF_LOG_ERROR, 0, LG_MSG_SET_PARAM_FAILED,
+                   "Failed to set %s "
+                   "to params dictionary",
+                   GFDB_IPC_CTR_GET_DB_KEY);
+            goto out;
+        }
+
+        ret = syncop_ipc(local_brick->xlator, GF_IPC_TARGET_CTR,
+                         ctr_ipc_in_dict, &ctr_ipc_out_dict);
+        if (ret || ctr_ipc_out_dict == NULL) {
+            gf_msg("tier", GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+                   "Failed to get "
+                   "journal_mode of sql db %s",
+                   local_brick->brick_db_path);
+            goto out;
+        }
+
+        ret = dict_get_str(ctr_ipc_out_dict, "journal_mode", &strval);
+        if (ret) {
+            /* Log the key that was looked up. strval is not usable on
+             * this path (it starts out NULL), so it must not be fed to
+             * a %s conversion. */
+            gf_msg("tier", GF_LOG_ERROR, 0, LG_MSG_GET_PARAM_FAILED,
+                   "Failed to get %s "
+                   "from params dictionary",
+                   "journal_mode");
+            goto out;
+        }
+
+        if (strval && (strncmp(strval, "wal", SLEN("wal")) == 0)) {
+            ret = tier_process_self_query(local_brick, args);
+            if (ret) {
+                goto out;
+            }
+        } else {
+            ret = tier_process_ctr_query(local_brick, args);
+            if (ret) {
+                goto out;
+            }
+        }
+    } else {
+        ret = tier_process_self_query(local_brick, args);
+        if (ret) {
+            goto out;
+        }
+    }
+
+    ret = 0;
+out:
+    if (ctr_ipc_in_dict)
+        dict_unref(ctr_ipc_in_dict);
+
+    if (ctr_ipc_out_dict)
+        dict_unref(ctr_ipc_out_dict);
+
+    return ret;
+}
+
+/*
+ * Build one query file per brick on args->brick_list.
+ *
+ * The query time stamp is "now minus args->freq_time" with the microsecond
+ * part forced to 0. A failing brick is logged and skipped; it does not fail
+ * the whole call.
+ *
+ * Returns 0 on success, or -1 when the current time cannot be obtained.
+ */
+static int
+tier_build_migration_qfile(migration_args_t *args,
+                           query_cbk_args_t *query_cbk_args,
+                           gf_boolean_t is_promotion)
+{
+    gfdb_time_t now;
+    gfdb_time_t cutoff;
+    gfdb_brick_info_t brick_info;
+    tier_brick_list_t *brick = NULL;
+    int brick_index = 0;
+
+    cutoff.tv_sec = args->freq_time;
+    cutoff.tv_usec = 0;
+
+    if (gettimeofday(&now, NULL) == -1) {
+        gf_msg(args->this->name, GF_LOG_ERROR, errno,
+               DHT_MSG_SYS_CALL_GET_TIME_FAILED, "Failed to get current time");
+        return -1;
+    }
+    cutoff.tv_sec = now.tv_sec - cutoff.tv_sec;
+
+    /* The migration daemon may run a varying number of usec after the
+     * sleep call triggers. A file may be registered in CTR some number
+     * of usec X after the daemon started and missed in the subsequent
+     * cycle if the daemon starts Y usec after the period in seconds
+     * where Y>X. Normalize away this problem by always setting usec
+     * to 0. */
+    cutoff.tv_usec = 0;
+
+    brick_info.time_stamp = &cutoff;
+    brick_info._gfdb_promote = is_promotion;
+    brick_info._query_cbk_args = query_cbk_args;
+
+    list_for_each_entry(brick, args->brick_list, list)
+    {
+        /* Query file path for this brick:
+         *   /var/run/gluster/xlator_name/
+         *   {promote/demote}-brickname-indexinbricklist
+         * The index keeps paths distinct even when two bricks share a
+         * name. */
+        snprintf(brick->qfile_path, PATH_MAX, "%s-%s-%d",
+                 GET_QFILE_PATH(brick_info._gfdb_promote), brick->brick_name,
+                 brick_index);
+
+        /* Remove whatever an earlier cycle left behind. */
+        sys_unlink(brick->qfile_path);
+
+        if (tier_process_brick(brick, &brick_info)) {
+            gf_msg(args->this->name, GF_LOG_ERROR, 0,
+                   DHT_MSG_BRICK_QUERY_FAILED, "Brick %s query failed\n",
+                   brick->brick_db_path);
+        }
+        brick_index++;
+    }
+
+    return 0;
+}
+
+/*
+ * Open every brick's query file, hand the resulting descriptor array to
+ * tier_migrate_using_query_file() and release the array afterwards.
+ *
+ * A query file that cannot be opened is logged and counted in
+ * qfile_array->exhausted_count; its (negative) descriptor is still stored so
+ * array slots stay aligned with the brick list.
+ *
+ * When the result is non-zero, each query file is renamed to
+ * "<qfile_path>-<UTC timestamp>.err" to keep it around for debugging.
+ *
+ * Returns the value of tier_migrate_using_query_file(), or -1 when the
+ * descriptor array cannot be allocated.
+ */
+static int
+tier_migrate_files_using_qfile(migration_args_t *comp,
+                               query_cbk_args_t *query_cbk_args)
+{
+    int ret = -1;
+    tier_brick_list_t *local_brick = NULL;
+    tier_brick_list_t *temp = NULL;
+    gfdb_time_t current_time = {
+        0,
+    };
+    ssize_t qfile_array_size = 0;
+    int count = 0;
+    int temp_fd = -1;
+    gf_tier_conf_t *tier_conf = NULL;
+
+    tier_conf = &(query_cbk_args->defrag->tier_conf);
+
+    /* Time for error query files */
+    gettimeofday(&current_time, NULL);
+
+    /* Build the qfile list */
+    list_for_each_entry_safe(local_brick, temp, comp->brick_list, list)
+    {
+        qfile_array_size++;
+    }
+    query_cbk_args->qfile_array = qfile_array_new(qfile_array_size);
+    if (!query_cbk_args->qfile_array) {
+        gf_msg("tier", GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+               "Failed to create new "
+               "qfile_array");
+        goto out;
+    }
+
+    /*Open all qfiles*/
+    count = 0;
+    query_cbk_args->qfile_array->exhausted_count = 0;
+    list_for_each_entry_safe(local_brick, temp, comp->brick_list, list)
+    {
+        /* Read-only open: no mode argument is needed without O_CREAT. */
+        temp_fd = open(local_brick->qfile_path, O_RDONLY);
+        if (temp_fd < 0) {
+            gf_msg("tier", GF_LOG_ERROR, errno, DHT_MSG_LOG_TIER_ERROR,
+                   "Failed to open "
+                   "%s to the query file",
+                   local_brick->qfile_path);
+            query_cbk_args->qfile_array->exhausted_count++;
+        }
+        query_cbk_args->qfile_array->fd_array[count] = temp_fd;
+        count++;
+    }
+
+    /* Moving the query file index to the next, so that we won't the same
+     * query file every cycle as the first one */
+    query_cbk_args->qfile_array
+        ->next_index = (query_cbk_args->is_promotion)
+                           ? tier_conf->last_promote_qfile_index
+                           : tier_conf->last_demote_qfile_index;
+    shift_next_index(query_cbk_args->qfile_array);
+    if (query_cbk_args->is_promotion) {
+        tier_conf->last_promote_qfile_index = query_cbk_args->qfile_array
+                                                  ->next_index;
+    } else {
+        tier_conf->last_demote_qfile_index = query_cbk_args->qfile_array
+                                                 ->next_index;
+    }
+
+    /* Migrate files using query file list */
+    ret = tier_migrate_using_query_file((void *)query_cbk_args);
+out:
+    qfile_array_free(query_cbk_args->qfile_array);
+
+    /* If there is an error rename all the query files to .err files
+     * with a timestamp for better debugging */
+    if (ret) {
+        struct tm tm = {
+            0,
+        };
+        char time_str[128] = {
+            0,
+        };
+        char query_file_path_err[PATH_MAX] = {
+            0,
+        };
+        int32_t len = 0;
+
+        /* Time format for error query files */
+        gmtime_r(&current_time.tv_sec, &tm);
+        strftime(time_str, sizeof(time_str), "%F-%T", &tm);
+
+        list_for_each_entry_safe(local_brick, temp, comp->brick_list, list)
+        {
+            /* rename error qfile*/
+            len = snprintf(query_file_path_err, sizeof(query_file_path_err),
+                           "%s-%s.err", local_brick->qfile_path, time_str);
+            /* Skip the rename when the target name was truncated. */
+            if ((len >= 0) && ((size_t)len < sizeof(query_file_path_err))) {
+                if (sys_rename(local_brick->qfile_path, query_file_path_err) ==
+                    -1)
+                    gf_msg_debug("tier", 0,
+                                 "rename "
+                                 "failed");
+            }
+        }
+    }
+
+    query_cbk_args->qfile_array = NULL;
+
+    return ret;
+}
+
+/*
+ * Run one demotion cycle: build the per-brick query files, then migrate the
+ * files they list.
+ *
+ * The result is returned and, when demotion_args is non-NULL, also stored in
+ * demotion_args->return_value. Returns 0 on success and non-zero on failure.
+ */
+int
+tier_demote(migration_args_t *demotion_args)
+{
+    /* Zero the whole context so that members not assigned below do not
+     * carry stack garbage into the helpers. */
+    query_cbk_args_t query_cbk_args = {
+        0,
+    };
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO("tier", demotion_args, out);
+    GF_VALIDATE_OR_GOTO("tier", demotion_args->this, out);
+    GF_VALIDATE_OR_GOTO(demotion_args->this->name, demotion_args->brick_list,
+                        out);
+    GF_VALIDATE_OR_GOTO(demotion_args->this->name, demotion_args->defrag, out);
+
+    THIS = demotion_args->this;
+
+    query_cbk_args.this = demotion_args->this;
+    query_cbk_args.defrag = demotion_args->defrag;
+    query_cbk_args.is_promotion = 0;
+    /* No query file is open yet; 0 would be a valid descriptor. */
+    query_cbk_args.query_fd = -1;
+
+    /*Build the query file using bricklist*/
+    ret = tier_build_migration_qfile(demotion_args, &query_cbk_args, _gf_false);
+    if (ret)
+        goto out;
+
+    /* Migrate files using the query file */
+    ret = tier_migrate_files_using_qfile(demotion_args, &query_cbk_args);
+    if (ret)
+        goto out;
+
+out:
+    /* demotion_args may be NULL when the first validation failed. */
+    if (demotion_args)
+        demotion_args->return_value = ret;
+    return ret;
+}
+
+/*
+ * Run one promotion cycle: build the per-brick query files, then migrate the
+ * files they list.
+ *
+ * The result is returned and, when promotion_args is non-NULL, also stored
+ * in promotion_args->return_value. Returns 0 on success and non-zero on
+ * failure.
+ */
+int
+tier_promote(migration_args_t *promotion_args)
+{
+    int ret = -1;
+    /* Zero the whole context so that members not assigned below do not
+     * carry stack garbage into the helpers. */
+    query_cbk_args_t query_cbk_args = {
+        0,
+    };
+
+    /* Same argument checks as tier_demote(). */
+    GF_VALIDATE_OR_GOTO("tier", promotion_args, out);
+    GF_VALIDATE_OR_GOTO("tier", promotion_args->this, out);
+    GF_VALIDATE_OR_GOTO(promotion_args->this->name, promotion_args->brick_list,
+                        out);
+    GF_VALIDATE_OR_GOTO(promotion_args->this->name, promotion_args->defrag,
+                        out);
+
+    THIS = promotion_args->this;
+
+    query_cbk_args.this = promotion_args->this;
+    query_cbk_args.defrag = promotion_args->defrag;
+    query_cbk_args.is_promotion = 1;
+    /* No query file is open yet; 0 would be a valid descriptor. */
+    query_cbk_args.query_fd = -1;
+
+    /*Build the query file using bricklist*/
+    ret = tier_build_migration_qfile(promotion_args, &query_cbk_args, _gf_true);
+    if (ret)
+        goto out;
+
+    /* Migrate files using the query file */
+    ret = tier_migrate_files_using_qfile(promotion_args, &query_cbk_args);
+    if (ret)
+        goto out;
+
+out:
+    /* promotion_args may be NULL when the first validation failed. */
+    if (promotion_args)
+        promotion_args->return_value = ret;
+    return ret;
+}
+
+/*
+ * Command the CTR on a brick to compact the local database using an IPC
+ *
+ * Opens a gfdb connection to local_brick->brick_db_path, then sends a
+ * GFDB_IPC_CTR_SET_COMPACT_PRAGMA request carrying the current
+ * "compact_active" and "compact_mode_switched" values from tier_conf.
+ *
+ * args is a gfdb_brick_info_t. Returns 0 on success and non-zero on failure.
+ */
+static int
+tier_process_self_compact(tier_brick_list_t *local_brick, void *args)
+{
+    int ret = -1;
+    char *db_path = NULL;
+    query_cbk_args_t *query_cbk_args = NULL;
+    xlator_t *this = NULL;
+    gfdb_conn_node_t *conn_node = NULL;
+    dict_t *params_dict = NULL;
+    dict_t *ctr_ipc_dict = NULL;
+    gfdb_brick_info_t *gfdb_brick_info = args;
+
+    /*Init of all the essentials*/
+    GF_VALIDATE_OR_GOTO("tier", gfdb_brick_info, out);
+    query_cbk_args = gfdb_brick_info->_query_cbk_args;
+
+    /* Validate the pointer before the first dereference. */
+    GF_VALIDATE_OR_GOTO("tier", query_cbk_args, out);
+
+    GF_VALIDATE_OR_GOTO("tier", query_cbk_args->this, out);
+    this = query_cbk_args->this;
+
+    GF_VALIDATE_OR_GOTO(this->name, local_brick, out);
+
+    GF_VALIDATE_OR_GOTO(this->name, local_brick->xlator, out);
+
+    GF_VALIDATE_OR_GOTO(this->name, local_brick->brick_db_path, out);
+
+    db_path = local_brick->brick_db_path;
+
+    /*Preparing DB parameters before init_db i.e getting db connection*/
+    params_dict = dict_new();
+    if (!params_dict) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+               "DB Params cannot initialized");
+        goto out;
+    }
+    SET_DB_PARAM_TO_DICT(this->name, params_dict,
+                         (char *)gfdb_methods.get_db_path_key(), db_path, ret,
+                         out);
+
+    /* The two failure paths below do not assign ret themselves; make sure
+     * they report an error instead of a stale success value. */
+    ret = -1;
+
+    /*Get the db connection*/
+    conn_node = gfdb_methods.init_db((void *)params_dict, dht_tier_db_type);
+    if (!conn_node) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+               "FATAL: Failed initializing db operations");
+        goto out;
+    }
+
+    /*Preparing ctr_ipc_dict*/
+    ctr_ipc_dict = dict_new();
+    if (!ctr_ipc_dict) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+               "ctr_ipc_dict cannot initialized");
+        goto out;
+    }
+
+    ret = dict_set_int32(ctr_ipc_dict, "compact_active",
+                         query_cbk_args->defrag->tier_conf.compact_active);
+
+    if (ret) {
+        gf_msg("tier", GF_LOG_ERROR, 0, LG_MSG_SET_PARAM_FAILED,
+               "Failed to set %s "
+               "to params dictionary",
+               "compact_active");
+        goto out;
+    }
+
+    ret = dict_set_int32(
+        ctr_ipc_dict, "compact_mode_switched",
+        query_cbk_args->defrag->tier_conf.compact_mode_switched);
+
+    if (ret) {
+        gf_msg("tier", GF_LOG_ERROR, 0, LG_MSG_SET_PARAM_FAILED,
+               "Failed to set %s "
+               "to params dictionary",
+               "compact_mode_switched");
+        goto out;
+    }
+
+    /* ret is 0 here; reset it so a failure inside the macro below cannot
+     * leave this function returning success. */
+    ret = -1;
+    SET_DB_PARAM_TO_DICT(this->name, ctr_ipc_dict, GFDB_IPC_CTR_KEY,
+                         GFDB_IPC_CTR_SET_COMPACT_PRAGMA, ret, out);
+
+    gf_msg(this->name, GF_LOG_TRACE, 0, DHT_MSG_LOG_TIER_STATUS,
+           "Starting Compaction IPC");
+
+    ret = syncop_ipc(local_brick->xlator, GF_IPC_TARGET_CTR, ctr_ipc_dict,
+                     NULL);
+
+    gf_msg(this->name, GF_LOG_TRACE, 0, DHT_MSG_LOG_TIER_STATUS,
+           "Ending Compaction IPC");
+
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+               "Failed compaction "
+               "on db %s error %d",
+               local_brick->brick_db_path, ret);
+        goto out;
+    }
+
+    gf_msg(this->name, GF_LOG_TRACE, 0, DHT_MSG_LOG_TIER_STATUS,
+           "SUCCESS: %s Compaction", local_brick->brick_name);
+
+    ret = 0;
+out:
+    if (params_dict) {
+        dict_unref(params_dict);
+        params_dict = NULL;
+    }
+
+    if (ctr_ipc_dict) {
+        dict_unref(ctr_ipc_dict);
+        ctr_ipc_dict = NULL;
+    }
+
+    gfdb_methods.fini_db(conn_node);
+
+    return ret;
+}
+
+/*
+ * Per-brick callback used while walking the hot/cold brick list for
+ * compaction. Validates the brick entry and forwards it, together with
+ * args, to tier_process_self_compact().
+ *
+ * Returns 0 on success, -1 on an invalid brick entry, or the non-zero result
+ * of tier_process_self_compact().
+ */
+static int
+tier_compact_db_brick(tier_brick_list_t *local_brick, void *args)
+{
+    int status = -1;
+
+    GF_VALIDATE_OR_GOTO("tier", local_brick, done);
+    GF_VALIDATE_OR_GOTO("tier", local_brick->xlator, done);
+
+    status = tier_process_self_compact(local_brick, args);
+    if (status != 0) {
+        gf_msg("tier", GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
+               "Brick %s did not compact", local_brick->brick_name);
+    }
+
+done:
+    return status;
+}
+
+/*
+ * Send the compaction request to every brick on args->brick_list.
+ *
+ * A brick that fails to compact is logged and skipped; it does not fail the
+ * whole call.
+ *
+ * Returns 0 on success, or -1 when the current time cannot be obtained.
+ */
+static int
+tier_send_compact(migration_args_t *args, query_cbk_args_t *query_cbk_args)
+{
+    gfdb_time_t now;
+    gfdb_time_t cutoff;
+    gfdb_brick_info_t brick_info;
+    tier_brick_list_t *brick = NULL;
+
+    cutoff.tv_sec = args->freq_time;
+    cutoff.tv_usec = 0;
+
+    if (gettimeofday(&now, NULL) == -1) {
+        gf_msg(args->this->name, GF_LOG_ERROR, errno,
+               DHT_MSG_SYS_CALL_GET_TIME_FAILED, "Failed to get current time");
+        return -1;
+    }
+    cutoff.tv_sec = now.tv_sec - cutoff.tv_sec;
+
+    /* The migration daemon may run a varying number of usec after the sleep
+       call triggers. A file may be registered in CTR some number of usec X
+       after the daemon started and missed in the subsequent cycle if the
+       daemon starts Y usec after the period in seconds where Y>X. Normalize
+       away this problem by always setting usec to 0. */
+    cutoff.tv_usec = 0;
+
+    brick_info.time_stamp = &cutoff;
+
+    /* Compaction has no promote/demote direction of its own; the promotion
+     * flag is simply borrowed and always set. */
+    brick_info._gfdb_promote = 1;
+
+    brick_info._query_cbk_args = query_cbk_args;
+
+    list_for_each_entry(brick, args->brick_list, list)
+    {
+        gf_msg(args->this->name, GF_LOG_TRACE, 0, DHT_MSG_LOG_TIER_STATUS,
+               "Start compaction for %s", brick->brick_name);
+
+        if (tier_compact_db_brick(brick, &brick_info)) {
+            gf_msg(args->this->name, GF_LOG_ERROR, 0,
+                   DHT_MSG_BRICK_QUERY_FAILED, "Brick %s compaction failed\n",
+                   brick->brick_db_path);
+        }
+
+        gf_msg(args->this->name, GF_LOG_TRACE, 0, DHT_MSG_LOG_TIER_STATUS,
+               "End compaction for %s", brick->brick_name);
+    }
+
+    return 0;
+}
+
+/*
+ * Run one compaction pass over the brick list in args.
+ *
+ * args is a migration_args_t. The result is returned and, when args is
+ * non-NULL, also stored in its return_value member. Returns 0 on success
+ * and non-zero on failure.
+ */
+static int
+tier_compact(void *args)
+{
+    int ret = -1;
+    /* Zero the whole context so that members not assigned below do not
+     * carry stack garbage into the helpers. */
+    query_cbk_args_t query_cbk_args = {
+        0,
+    };
+    migration_args_t *compaction_args = args;
+
+    GF_VALIDATE_OR_GOTO("tier", compaction_args, out);
+    GF_VALIDATE_OR_GOTO("tier", compaction_args->this, out);
+    GF_VALIDATE_OR_GOTO(compaction_args->this->name,
+                        compaction_args->brick_list, out);
+    GF_VALIDATE_OR_GOTO(compaction_args->this->name, compaction_args->defrag,
+                        out);
+
+    THIS = compaction_args->this;
+
+    query_cbk_args.this = compaction_args->this;
+    query_cbk_args.defrag = compaction_args->defrag;
+    query_cbk_args.is_compaction = 1;
+    /* No query file is used for compaction; 0 would be a valid fd. */
+    query_cbk_args.query_fd = -1;
+
+    /* Send the compaction pragma out to all the bricks on the bricklist. */
+    /* tier_get_bricklist ensures all bricks on the list are local to */
+    /* this node. */
+    ret = tier_send_compact(compaction_args, &query_cbk_args);
+    if (ret)
+        goto out;
+
+    ret = 0;
+out:
+    /* compaction_args may be NULL when the first validation failed. */
+    if (compaction_args)
+        compaction_args->return_value = ret;
+    return ret;
+}
+
+/*
+ * Walk the xlator graph below xl and append one tier_brick_list_t entry to
+ * local_bricklist_head for every "protocol/client" xlator whose
+ * "remote-host" option is a local address.
+ *
+ * For each such brick the entry records the client xlator, the brick name
+ * (last path component of "remote-subvolume") and the database path
+ * "<remote-subvolume>/<GF_HIDDEN_PATH>/<brickname>.db".
+ *
+ * Returns 0 on success (including invalid arguments, where ret is left at
+ * its initial 0) and non-zero on failure. Entries already added to the list
+ * are left in place on failure.
+ */
+static int
+tier_get_bricklist(xlator_t *xl, struct list_head *local_bricklist_head)
+{
+    xlator_list_t *child = NULL;
+    char *rv = NULL;
+    char *rh = NULL;
+    char *brickname = NULL;
+    char db_name[PATH_MAX] = "";
+    int ret = 0;
+    tier_brick_list_t *local_brick = NULL;
+    int32_t len = 0;
+
+    GF_VALIDATE_OR_GOTO("tier", xl, out);
+    GF_VALIDATE_OR_GOTO("tier", local_bricklist_head, out);
+
+    /*
+     * This function obtains remote subvolumes and filters out only
+     * those running on the same node as the tier daemon.
+     */
+    if (strcmp(xl->type, "protocol/client") == 0) {
+        ret = dict_get_str(xl->options, "remote-host", &rh);
+        if (ret < 0)
+            goto out;
+
+        if (gf_is_local_addr(rh)) {
+            local_brick = GF_CALLOC(1, sizeof(tier_brick_list_t),
+                                    gf_tier_mt_bricklist_t);
+            if (!local_brick) {
+                /* ret still holds the successful dict_get_str() result;
+                 * report the allocation failure explicitly. */
+                ret = -1;
+                goto out;
+            }
+
+            ret = dict_get_str(xl->options, "remote-subvolume", &rv);
+            if (ret < 0)
+                goto out;
+
+            /* Brick name is whatever follows the last '/'. If the path
+             * has no '/', use the whole string rather than computing
+             * NULL + 1. */
+            brickname = strrchr(rv, '/');
+            brickname = brickname ? brickname + 1 : rv;
+            snprintf(db_name, sizeof(db_name), "%s.db", brickname);
+
+            local_brick->brick_db_path = GF_MALLOC(PATH_MAX, gf_common_mt_char);
+            if (!local_brick->brick_db_path) {
+                gf_msg("tier", GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_STATUS,
+                       "Failed to allocate memory for"
+                       " bricklist.");
+                ret = -1;
+                goto out;
+            }
+
+            len = snprintf(local_brick->brick_db_path, PATH_MAX, "%s/%s/%s", rv,
+                           GF_HIDDEN_PATH, db_name);
+            if ((len < 0) || (len >= PATH_MAX)) {
+                gf_msg("tier", GF_LOG_ERROR, EINVAL, DHT_MSG_LOG_TIER_STATUS,
+                       "DB path too long");
+                ret = -1;
+                goto out;
+            }
+
+            local_brick->xlator = xl;
+
+            snprintf(local_brick->brick_name, NAME_MAX, "%s", brickname);
+
+            list_add_tail(&(local_brick->list), local_bricklist_head);
+
+            ret = 0;
+            goto out;
+        }
+    }
+
+    for (child = xl->children; child; child = child->next) {
+        ret = tier_get_bricklist(child->xlator, local_bricklist_head);
+        if (ret) {
+            goto out;
+        }
+    }
+
+    ret = 0;
+out:
+
+    /* On failure the entry was never linked into the list, so it is
+     * released here together with its database path buffer. */
+    if (ret) {
+        if (local_brick) {
+            GF_FREE(local_brick->brick_db_path);
+        }
+        GF_FREE(local_brick);
+    }
+
+    return ret;
+}
+
+int
+tier_get_freq_demote(gf_tier_conf_t *tier_conf)
+{
+ if ((tier_conf->mode == TIER_MODE_WM) &&
+ (tier_conf->watermark_last == TIER_WM_HI))
+ return DEFAULT_DEMOTE_DEGRADED;
+ else
+ return tier_conf->tier_demote_frequency;
+}
+
+int
+tier_get_freq_promote(gf_tier_conf_t *tier_conf)
+{
+ return tier_conf->tier_promote_frequency;
+}
+
+int
+tier_get_freq_compact_hot(gf_tier_conf_t *tier_conf)
+{
+ return tier_conf->tier_compact_hot_frequency;
+}
+
+int
+tier_get_freq_compact_cold(gf_tier_conf_t *tier_conf)
+{
+ return tier_conf->tier_compact_cold_frequency;
+}
+
+static int
+tier_check_demote(gfdb_time_t current_time, int freq)
+{ /* freq <= 0 never fires; it must not reach '%' (division by zero). */
+    return ((freq > 0) && !(current_time.tv_sec % freq)) ? _gf_true : _gf_false;
+}
+
+static gf_boolean_t
+tier_check_promote(gf_tier_conf_t *tier_conf, gfdb_time_t current_time,
+                   int freq)
+{
+    /* Never fires in WM mode at TIER_WM_HI, nor for a non-positive freq. */
+    if (((tier_conf->mode == TIER_MODE_WM) &&
+         (tier_conf->watermark_last == TIER_WM_HI)) ||
+        (freq <= 0))
+        return _gf_false;
+    return ((current_time.tv_sec % freq) == 0) ? _gf_true : _gf_false;
+}
+
+static gf_boolean_t
+tier_check_compact(gf_tier_conf_t *tier_conf, gfdb_time_t current_time,
+                   int freq_compact)
+{
+    if (!(tier_conf->compact_active || tier_conf->compact_mode_switched) ||
+        (freq_compact <= 0)) /* a zero frequency must not reach '%' */
+        return _gf_false;
+    return ((current_time.tv_sec % freq_compact) == 0) ? _gf_true : _gf_false;
+}
+
+void
+clear_bricklist(struct list_head *brick_list)
+{
+ tier_brick_list_t *local_brick = NULL;
+ tier_brick_list_t *temp = NULL;
+
+ if (list_empty(brick_list)) {
+ return;
+ }
+
+ list_for_each_entry_safe(local_brick, temp, brick_list, list)
+ {
+ list_del(&local_brick->list);
+ GF_FREE(local_brick->brick_db_path);
+ GF_FREE(local_brick);
+ }
+}
+
+static void
+set_brick_list_qpath(struct list_head *brick_list, gf_boolean_t is_cold)
+{
+ tier_brick_list_t *local_brick = NULL;
+ int i = 0;
+
+ GF_VALIDATE_OR_GOTO("tier", brick_list, out);
+
+ list_for_each_entry(local_brick, brick_list, list)
+ {
+ /* Construct query file path for this brick
+ * i.e
+ * /var/run/gluster/xlator_name/
+ * {promote/demote}-brickname-indexinbricklist
+ * So that no two query files will have same path even
+ * bricks have the same name
+ * */
+ snprintf(local_brick->qfile_path, PATH_MAX, "%s-%s-%d",
+ GET_QFILE_PATH(is_cold), local_brick->brick_name, i);
+ i++;
+ }
+out:
+ return;
+}
+
+static int
+tier_prepare_compact(migration_args_t *args, gfdb_time_t current_time)
+{
+ xlator_t *this = NULL;
+ dht_conf_t *conf = NULL;
+ gf_defrag_info_t *defrag = NULL;
+ gf_tier_conf_t *tier_conf = NULL;
+ gf_boolean_t is_hot_tier = args->is_hot_tier;
+ int freq = 0;
+ int ret = -1;
+ const char *tier_type = is_hot_tier ? "hot" : "cold";
+
+ this = args->this;
+
+ conf = this->private;
+
+ defrag = conf->defrag;
+
+ tier_conf = &defrag->tier_conf;
+
+ freq = is_hot_tier ? tier_get_freq_compact_hot(tier_conf)
+ : tier_get_freq_compact_cold(tier_conf);
+
+ defrag->tier_conf.compact_mode_switched =
+ is_hot_tier ? defrag->tier_conf.compact_mode_switched_hot
+ : defrag->tier_conf.compact_mode_switched_cold;
+
+ gf_msg(this->name, GF_LOG_TRACE, 0, DHT_MSG_LOG_TIER_STATUS,
+ "Compact mode %i", defrag->tier_conf.compact_mode_switched);
+
+ if (tier_check_compact(tier_conf, current_time, freq)) {
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
+ "Start compaction on %s tier", tier_type);
+
+ args->freq_time = freq;
+ ret = tier_compact(args);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+ "Compaction failed on "
+ "%s tier",
+ tier_type);
+ goto out;
+ }
+
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
+ "End compaction on %s tier", tier_type);
+
+ if (is_hot_tier) {
+ defrag->tier_conf.compact_mode_switched_hot = _gf_false;
+ } else {
+ defrag->tier_conf.compact_mode_switched_cold = _gf_false;
+ }
+ }
+
+out:
+ return ret;
+}
+
+static int
+tier_get_wm_interval(tier_mode_t mode, tier_watermark_op_t wm)
+{
+ if (mode == TIER_MODE_WM && wm == TIER_WM_HI)
+ return WM_INTERVAL_EMERG;
+
+ return WM_INTERVAL;
+}
+
+/*
+ * Main tiering loop. This is called from the promotion and the
+ * demotion threads spawned in tier_start().
+ *
+ * Every second, wake from sleep to perform tasks.
+ * 1. Check trigger to migrate data.
+ * 2. Check for state changes (pause, unpause, stop).
+ */
+static void *
+tier_run(void *in_args)
+{
+    dht_conf_t *conf = NULL;
+    gfdb_time_t current_time = {0};
+    int freq = 0;
+    int ret = 0;
+    xlator_t *any = NULL;
+    xlator_t *xlator = NULL;
+    gf_tier_conf_t *tier_conf = NULL;
+    loc_t root_loc = {0};
+    int check_watermark = 0;
+    gf_defrag_info_t *defrag = NULL;
+    xlator_t *this = NULL;
+    migration_args_t *args = in_args;
+    GF_VALIDATE_OR_GOTO("tier", args, out);
+    GF_VALIDATE_OR_GOTO("tier", args->brick_list, out);
+
+    this = args->this;
+    GF_VALIDATE_OR_GOTO("tier", this, out);
+
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO("tier", conf, out);
+
+    defrag = conf->defrag;
+    GF_VALIDATE_OR_GOTO("tier", defrag, out);
+
+    if (list_empty(args->brick_list)) {
+        gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_ERROR,
+               "Brick list for tier is empty. Exiting.");
+        goto out;
+    }
+
+    defrag->defrag_status = GF_DEFRAG_STATUS_STARTED;
+    tier_conf = &defrag->tier_conf;
+
+    dht_build_root_loc(defrag->root_inode, &root_loc);
+
+    while (1) {
+        /*
+         * Check if a graph switch occurred. If so, stop migration
+         * thread. It will need to be restarted manually.
+         */
+        any = THIS->ctx->active->first;
+        xlator = xlator_search_by_name(any, this->name);
+
+        if (xlator != this) {
+            gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
+                   "Detected graph switch. Exiting migration "
+                   "daemon.");
+            goto out;
+        }
+
+        gf_defrag_check_pause_tier(tier_conf);
+
+        sleep(1);
+
+        if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) {
+            ret = 1;
+            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+                   "defrag->defrag_status != "
+                   "GF_DEFRAG_STATUS_STARTED");
+            goto out;
+        }
+
+        if (defrag->cmd == GF_DEFRAG_CMD_START_DETACH_TIER ||
+            defrag->cmd == GF_DEFRAG_CMD_DETACH_START) {
+            ret = 0;
+            defrag->defrag_status = GF_DEFRAG_STATUS_COMPLETE;
+            gf_msg(this->name, GF_LOG_DEBUG, 0, DHT_MSG_LOG_TIER_ERROR,
+                   "defrag->defrag_cmd == "
+                   "GF_DEFRAG_CMD_START_DETACH_TIER");
+            goto out;
+        }
+
+        if (gf_defrag_get_pause_state(&defrag->tier_conf) != TIER_RUNNING)
+            continue;
+
+        /* To have proper synchronization amongst all
+         * brick holding nodes, so that promotion and demotions
+         * start atomically w.r.t promotion/demotion frequency
+         * period, all nodes should have their system time
+         * in-sync with each other either manually set or
+         * using a NTP server*/
+        ret = gettimeofday(&current_time, NULL);
+        if (ret == -1) {
+            gf_msg(this->name, GF_LOG_ERROR, errno,
+                   DHT_MSG_SYS_CALL_GET_TIME_FAILED,
+                   "Failed to get current time");
+            goto out;
+        }
+
+        check_watermark++;
+
+        /* emergency demotion requires frequent watermark monitoring */
+        if (check_watermark >=
+            tier_get_wm_interval(tier_conf->mode, tier_conf->watermark_last)) {
+            check_watermark = 0;
+            if (tier_conf->mode == TIER_MODE_WM) {
+                ret = tier_get_fs_stat(this, &root_loc);
+                if (ret != 0) {
+                    continue;
+                }
+                ret = tier_check_watermark(this);
+                if (ret != 0) {
+                    gf_msg(this->name, GF_LOG_CRITICAL, errno,
+                           DHT_MSG_LOG_TIER_ERROR, "Failed to get watermark");
+                    continue;
+                }
+            }
+        }
+
+        if (args->is_promotion) {
+            freq = tier_get_freq_promote(tier_conf);
+
+            if (tier_check_promote(tier_conf, current_time, freq)) {
+                args->freq_time = freq;
+                ret = tier_promote(args);
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+                           "Promotion failed");
+                }
+            }
+        } else if (args->is_compaction) {
+            tier_prepare_compact(args, current_time);
+        } else {
+            freq = tier_get_freq_demote(tier_conf);
+
+            if (tier_check_demote(current_time, freq)) {
+                args->freq_time = freq;
+                ret = tier_demote(args);
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+                           "Demotion failed");
+                }
+            }
+        }
+
+        /* Check the statfs immediately after the processing threads
+           return */
+        check_watermark = WM_INTERVAL;
+    }
+
+    ret = 0;
+out:
+    /* args itself is NULL when the first validation above jumped here. */
+    if (args)
+        args->return_value = ret;
+    return NULL;
+}
+
+int
+tier_start(xlator_t *this, gf_defrag_info_t *defrag)
+{
+ pthread_t promote_thread;
+ pthread_t demote_thread;
+ pthread_t hot_compact_thread;
+ pthread_t cold_compact_thread;
+ int ret = -1;
+ struct list_head bricklist_hot = {0};
+ struct list_head bricklist_cold = {0};
+ migration_args_t promotion_args = {0};
+ migration_args_t demotion_args = {0};
+ migration_args_t hot_compaction_args = {0};
+ migration_args_t cold_compaction_args = {0};
+ dht_conf_t *conf = NULL;
+
+ INIT_LIST_HEAD((&bricklist_hot));
+ INIT_LIST_HEAD((&bricklist_cold));
+
+ conf = this->private;
+
+ tier_get_bricklist(conf->subvolumes[1], &bricklist_hot);
+ set_brick_list_qpath(&bricklist_hot, _gf_false);
+
+ demotion_args.this = this;
+ demotion_args.brick_list = &bricklist_hot;
+ demotion_args.defrag = defrag;
+ demotion_args.is_promotion = _gf_false;
+ demotion_args.is_compaction = _gf_false;
+
+ ret = gf_thread_create(&demote_thread, NULL, &tier_run, &demotion_args,
+ "tierdem");
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+ "Failed to start demotion thread.");
+ defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+ goto cleanup;
+ }
+
+ tier_get_bricklist(conf->subvolumes[0], &bricklist_cold);
+ set_brick_list_qpath(&bricklist_cold, _gf_true);
+
+ promotion_args.this = this;
+ promotion_args.brick_list = &bricklist_cold;
+ promotion_args.defrag = defrag;
+ promotion_args.is_promotion = _gf_true;
+
+ ret = gf_thread_create(&promote_thread, NULL, &tier_run, &promotion_args,
+ "tierpro");
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+ "Failed to start promotion thread.");
+ defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+ goto waitforspawned;
+ }
+
+ hot_compaction_args.this = this;
+ hot_compaction_args.brick_list = &bricklist_hot;
+ hot_compaction_args.defrag = defrag;
+ hot_compaction_args.is_promotion = _gf_false;
+ hot_compaction_args.is_compaction = _gf_true;
+ hot_compaction_args.is_hot_tier = _gf_true;
+
+ ret = gf_thread_create(&hot_compact_thread, NULL, &tier_run,
+ &hot_compaction_args, "tierhcom");
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+ "Failed to start compaction thread.");
+ defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+ goto waitforspawnedpromote;
+ }
+
+ cold_compaction_args.this = this;
+ cold_compaction_args.brick_list = &bricklist_cold;
+ cold_compaction_args.defrag = defrag;
+ cold_compaction_args.is_promotion = _gf_false;
+ cold_compaction_args.is_compaction = _gf_true;
+ cold_compaction_args.is_hot_tier = _gf_false;
+
+ ret = gf_thread_create(&cold_compact_thread, NULL, &tier_run,
+ &cold_compaction_args, "tierccom");
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+ "Failed to start compaction thread.");
+ defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+ goto waitforspawnedhotcompact;
+ }
+ pthread_join(cold_compact_thread, NULL);
+
+waitforspawnedhotcompact:
+ pthread_join(hot_compact_thread, NULL);
+
+waitforspawnedpromote:
+ pthread_join(promote_thread, NULL);
+
+waitforspawned:
+ pthread_join(demote_thread, NULL);
+
+cleanup:
+ clear_bricklist(&bricklist_cold);
+ clear_bricklist(&bricklist_hot);
+ return ret;
+}
+
+int32_t
+tier_migration_needed(xlator_t *this)
+{
+ gf_defrag_info_t *defrag = NULL;
+ dht_conf_t *conf = NULL;
+ int ret = 0;
+
+ conf = this->private;
+
+ GF_VALIDATE_OR_GOTO(this->name, conf, out);
+ GF_VALIDATE_OR_GOTO(this->name, conf->defrag, out);
+
+ defrag = conf->defrag;
+
+ if ((defrag->cmd == GF_DEFRAG_CMD_START_TIER) ||
+ (defrag->cmd == GF_DEFRAG_CMD_START_DETACH_TIER))
+ ret = 1;
+out:
+ return ret;
+}
+
+int32_t
+tier_migration_get_dst(xlator_t *this, dht_local_t *local)
+{
+ dht_conf_t *conf = NULL;
+ int32_t ret = -1;
+ gf_defrag_info_t *defrag = NULL;
+
+ GF_VALIDATE_OR_GOTO("tier", this, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ conf = this->private;
+
+ defrag = conf->defrag;
+
+ if (defrag && defrag->cmd == GF_DEFRAG_CMD_START_DETACH_TIER) {
+ local->rebalance.target_node = conf->subvolumes[0];
+
+ } else if (conf->subvolumes[0] == local->cached_subvol)
+ local->rebalance.target_node = conf->subvolumes[1];
+ else
+ local->rebalance.target_node = conf->subvolumes[0];
+
+ if (local->rebalance.target_node)
+ ret = 0;
+
+out:
+ return ret;
+}
+
+xlator_t *
+tier_search(xlator_t *this, dht_layout_t *layout, const char *name)
+{
+ xlator_t *subvol = NULL;
+ dht_conf_t *conf = NULL;
+
+ GF_VALIDATE_OR_GOTO("tier", this, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ conf = this->private;
+
+ subvol = TIER_HASHED_SUBVOL;
+
+out:
+ return subvol;
+}
+
+/* Load libgfdb and fill gfdb_methods from it. Returns 0 on success. */
+static int
+tier_load_externals(xlator_t *this)
+{
+    int ret = -1;
+    char *libpathfull = (LIBDIR "/libgfdb.so.0");
+    get_gfdb_methods_t get_gfdb_methods;
+
+    GF_VALIDATE_OR_GOTO("this", this, out);
+
+    libhandle = dlopen(libpathfull, RTLD_NOW);
+    if (!libhandle) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+               "Error loading libgfdb.so %s\n", dlerror());
+        ret = -1;
+        goto out;
+    }
+
+    get_gfdb_methods = dlsym(libhandle, "get_gfdb_methods");
+    if (!get_gfdb_methods) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+               "Error loading get_gfdb_methods()");
+        ret = -1;
+        goto out;
+    }
+
+    get_gfdb_methods(&gfdb_methods);
+    ret = 0;
+out:
+    if (ret && libhandle) {
+        dlclose(libhandle);
+        libhandle = NULL; /* tier_fini() must not dlclose() it again */
+    }
+    return ret;
+}
+
+static tier_mode_t
+tier_validate_mode(char *mode)
+{
+ int ret = -1;
+
+ if (strcmp(mode, "test") == 0) {
+ ret = TIER_MODE_TEST;
+ } else {
+ ret = TIER_MODE_WM;
+ }
+
+ return ret;
+}
+
+/* Map the "tier-compact" option string to a boolean: only "on" is true. */
+static gf_boolean_t
+tier_validate_compact_mode(char *mode)
+{
+    gf_boolean_t ret = _gf_false;
+
+    gf_msg("tier", GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
+           "tier_validate_compact_mode: mode = %s", mode);
+
+    if (!strcmp(mode, "on")) {
+        ret = _gf_true;
+    } else {
+        ret = _gf_false;
+    }
+
+    gf_msg("tier", GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
+           "tier_validate_compact_mode: ret = %i", ret);
+    return ret;
+}
+
+int
+tier_init_methods(xlator_t *this)
+{
+ int ret = -1;
+ dht_conf_t *conf = NULL;
+ dht_methods_t *methods = NULL;
+
+ GF_VALIDATE_OR_GOTO("tier", this, err);
+
+ conf = this->private;
+
+ methods = &(conf->methods);
+
+ methods->migration_get_dst_subvol = tier_migration_get_dst;
+ methods->migration_other = tier_start;
+ methods->migration_needed = tier_migration_needed;
+ methods->layout_search = tier_search;
+
+ ret = 0;
+err:
+ return ret;
+}
+
+static void
+tier_save_vol_name(xlator_t *this)
+{
+ dht_conf_t *conf = NULL;
+ gf_defrag_info_t *defrag = NULL;
+ char *suffix = NULL;
+ int name_len = 0;
+
+ conf = this->private;
+ defrag = conf->defrag;
+
+ suffix = strstr(this->name, "-tier-dht");
+
+ if (suffix)
+ name_len = suffix - this->name;
+ else
+ name_len = strlen(this->name);
+
+ if (name_len > GD_VOLUME_NAME_MAX)
+ name_len = GD_VOLUME_NAME_MAX;
+
+ strncpy(defrag->tier_conf.volname, this->name, name_len);
+ defrag->tier_conf.volname[name_len] = 0;
+}
+
+/* Initialise the tier xlator on top of dht_init(); when conf->defrag is set
+ * also load libgfdb and read the tier options. Returns 0 on success. */
+int
+tier_init(xlator_t *this)
+{
+    int ret = -1;
+    int freq = 0;
+    int maxsize = 0;
+    dht_conf_t *conf = NULL;
+    gf_defrag_info_t *defrag = NULL;
+    char *voldir = NULL;
+    char *mode = NULL;
+    char *paused = NULL;
+    tier_mode_t tier_mode = DEFAULT_TIER_MODE;
+    gf_boolean_t compact_mode = _gf_false;
+
+    ret = dht_init(this);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+               "tier_init failed");
+        goto out;
+    }
+
+    conf = this->private;
+
+    ret = tier_init_methods(this);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+               "tier_init_methods failed");
+        goto out;
+    }
+
+    if (conf->subvolume_cnt != 2) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+               "Invalid number of subvolumes %d", conf->subvolume_cnt);
+        ret = -1; /* ret is still 0 from tier_init_methods() */
+        goto out;
+    }
+
+    /* if instantiated from client side initialization is complete. */
+    if (!conf->defrag) {
+        ret = 0;
+        goto out;
+    }
+
+    /* if instantiated from server side, load db libraries */
+    ret = tier_load_externals(this);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+               "Could not load externals. Aborting");
+        goto out;
+    }
+
+    defrag = conf->defrag;
+
+    defrag->tier_conf.last_demote_qfile_index = 0;
+    defrag->tier_conf.last_promote_qfile_index = 0;
+
+    defrag->tier_conf.is_tier = 1;
+    defrag->this = this;
+
+    ret = dict_get_int32(this->options, "tier-max-promote-file-size", &maxsize);
+    if (ret) {
+        maxsize = 0;
+    }
+
+    defrag->tier_conf.tier_max_promote_size = maxsize;
+
+    ret = dict_get_int32(this->options, "tier-promote-frequency", &freq);
+    if (ret) {
+        freq = DEFAULT_PROMOTE_FREQ_SEC;
+    }
+
+    defrag->tier_conf.tier_promote_frequency = freq;
+
+    ret = dict_get_int32(this->options, "tier-demote-frequency", &freq);
+    if (ret) {
+        freq = DEFAULT_DEMOTE_FREQ_SEC;
+    }
+
+    defrag->tier_conf.tier_demote_frequency = freq;
+
+    ret = dict_get_int32(this->options, "tier-hot-compact-frequency", &freq);
+    if (ret) {
+        freq = DEFAULT_HOT_COMPACT_FREQ_SEC;
+    }
+
+    defrag->tier_conf.tier_compact_hot_frequency = freq;
+
+    ret = dict_get_int32(this->options, "tier-cold-compact-frequency", &freq);
+    if (ret) {
+        freq = DEFAULT_COLD_COMPACT_FREQ_SEC;
+    }
+
+    defrag->tier_conf.tier_compact_cold_frequency = freq;
+
+    ret = dict_get_int32(this->options, "watermark-hi", &freq);
+    if (ret) {
+        freq = DEFAULT_WM_HI;
+    }
+
+    defrag->tier_conf.watermark_hi = freq;
+
+    ret = dict_get_int32(this->options, "watermark-low", &freq);
+    if (ret) {
+        freq = DEFAULT_WM_LOW;
+    }
+
+    defrag->tier_conf.watermark_low = freq;
+
+    ret = dict_get_int32(this->options, "write-freq-threshold", &freq);
+    if (ret) {
+        freq = DEFAULT_WRITE_FREQ_SEC;
+    }
+
+    defrag->write_freq_threshold = freq;
+
+    ret = dict_get_int32(this->options, "read-freq-threshold", &freq);
+    if (ret) {
+        freq = DEFAULT_READ_FREQ_SEC;
+    }
+
+    defrag->read_freq_threshold = freq;
+
+    ret = dict_get_int32(this->options, "tier-max-mb", &freq);
+    if (ret) {
+        freq = DEFAULT_TIER_MAX_MIGRATE_MB;
+    }
+
+    defrag->tier_conf.max_migrate_bytes = (uint64_t)freq * 1024 * 1024;
+
+    ret = dict_get_int32(this->options, "tier-max-files", &freq);
+    if (ret) {
+        freq = DEFAULT_TIER_MAX_MIGRATE_FILES;
+    }
+
+    defrag->tier_conf.max_migrate_files = freq;
+
+    ret = dict_get_int32(this->options, "tier-query-limit",
+                         &(defrag->tier_conf.query_limit));
+    if (ret) {
+        defrag->tier_conf.query_limit = DEFAULT_TIER_QUERY_LIMIT;
+    }
+
+    ret = dict_get_str(this->options, "tier-compact", &mode);
+    if (ret) {
+        defrag->tier_conf.compact_active = DEFAULT_COMP_MODE;
+    } else {
+        compact_mode = tier_validate_compact_mode(mode);
+        /* If compaction is now active, we need to inform the bricks on
+           the hot and cold tier of this. See dht-common.h for more. */
+        defrag->tier_conf.compact_active = compact_mode;
+        if (compact_mode) {
+            defrag->tier_conf.compact_mode_switched_hot = _gf_true;
+            defrag->tier_conf.compact_mode_switched_cold = _gf_true;
+        }
+    }
+
+    ret = dict_get_str(this->options, "tier-mode", &mode);
+    if (ret) {
+        defrag->tier_conf.mode = DEFAULT_TIER_MODE;
+    } else {
+        tier_mode = tier_validate_mode(mode);
+        defrag->tier_conf.mode = tier_mode;
+    }
+
+    pthread_mutex_init(&defrag->tier_conf.pause_mutex, 0);
+
+    gf_defrag_set_pause_state(&defrag->tier_conf, TIER_RUNNING);
+
+    ret = dict_get_str(this->options, "tier-pause", &paused);
+    if (paused && strcmp(paused, "on") == 0)
+        gf_defrag_set_pause_state(&defrag->tier_conf, TIER_REQUEST_PAUSE);
+
+    ret = gf_asprintf(&voldir, "%s/%s", DEFAULT_VAR_RUN_DIRECTORY, this->name);
+    if (ret < 0)
+        goto out;
+
+    ret = mkdir_p(voldir, 0777, _gf_true);
+    if (ret == -1 && errno != EEXIST) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+               "tier_init failed");
+        GF_FREE(voldir);
+        goto out;
+    }
+
+    GF_FREE(voldir);
+
+    ret = gf_asprintf(&promotion_qfile, "%s/%s/promote",
+                      DEFAULT_VAR_RUN_DIRECTORY, this->name);
+    if (ret < 0)
+        goto out;
+
+    ret = gf_asprintf(&demotion_qfile, "%s/%s/demote",
+                      DEFAULT_VAR_RUN_DIRECTORY, this->name);
+    if (ret < 0) {
+        GF_FREE(promotion_qfile);
+        promotion_qfile = NULL; /* tier_fini() frees it again otherwise */
+        goto out;
+    }
+
+    gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
+           "Promote/demote frequency %d/%d "
+           "Write/Read freq thresholds %d/%d",
+           defrag->tier_conf.tier_promote_frequency,
+           defrag->tier_conf.tier_demote_frequency,
+           defrag->write_freq_threshold, defrag->read_freq_threshold);
+
+    tier_save_vol_name(this);
+
+    ret = 0;
+
+out:
+    return ret;
+}
+
+int
+tier_cli_pause_done(int op_ret, call_frame_t *sync_frame, void *data)
+{
+ gf_msg("tier", GF_LOG_INFO, 0, DHT_MSG_TIER_PAUSED,
+ "Migrate file paused with op_ret %d", op_ret);
+
+ return op_ret;
+}
+
+int
+tier_cli_pause(void *data)
+{
+ gf_defrag_info_t *defrag = NULL;
+ xlator_t *this = NULL;
+ dht_conf_t *conf = NULL;
+ int ret = -1;
+
+ this = data;
+
+ conf = this->private;
+ GF_VALIDATE_OR_GOTO(this->name, conf, exit);
+
+ defrag = conf->defrag;
+ GF_VALIDATE_OR_GOTO(this->name, defrag, exit);
+
+ gf_defrag_pause_tier(this, defrag);
+
+ ret = 0;
+exit:
+ return ret;
+}
+
+int
+tier_reconfigure(xlator_t *this, dict_t *options)
+{
+ dht_conf_t *conf = NULL;
+ gf_defrag_info_t *defrag = NULL;
+ char *mode = NULL;
+ int migrate_mb = 0;
+ gf_boolean_t req_pause = _gf_false;
+ int ret = 0;
+ call_frame_t *frame = NULL;
+ gf_boolean_t last_compact_setting = _gf_false;
+
+ conf = this->private;
+
+ if (conf->defrag) {
+ defrag = conf->defrag;
+ GF_OPTION_RECONF("tier-max-promote-file-size",
+ defrag->tier_conf.tier_max_promote_size, options,
+ int32, out);
+
+ GF_OPTION_RECONF("tier-promote-frequency",
+ defrag->tier_conf.tier_promote_frequency, options,
+ int32, out);
+
+ GF_OPTION_RECONF("tier-demote-frequency",
+ defrag->tier_conf.tier_demote_frequency, options,
+ int32, out);
+
+ GF_OPTION_RECONF("write-freq-threshold", defrag->write_freq_threshold,
+ options, int32, out);
+
+ GF_OPTION_RECONF("read-freq-threshold", defrag->read_freq_threshold,
+ options, int32, out);
+
+ GF_OPTION_RECONF("watermark-hi", defrag->tier_conf.watermark_hi,
+ options, int32, out);
+
+ GF_OPTION_RECONF("watermark-low", defrag->tier_conf.watermark_low,
+ options, int32, out);
+
+ last_compact_setting = defrag->tier_conf.compact_active;
+
+ GF_OPTION_RECONF("tier-compact", defrag->tier_conf.compact_active,
+ options, bool, out);
+
+ if (last_compact_setting != defrag->tier_conf.compact_active) {
+ defrag->tier_conf.compact_mode_switched_hot = _gf_true;
+ defrag->tier_conf.compact_mode_switched_cold = _gf_true;
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
+ "compact mode switched");
+ }
+
+ GF_OPTION_RECONF("tier-hot-compact-frequency",
+ defrag->tier_conf.tier_compact_hot_frequency, options,
+ int32, out);
+
+ GF_OPTION_RECONF("tier-cold-compact-frequency",
+ defrag->tier_conf.tier_compact_cold_frequency, options,
+ int32, out);
+
+ GF_OPTION_RECONF("tier-mode", mode, options, str, out);
+ defrag->tier_conf.mode = tier_validate_mode(mode);
+
+ GF_OPTION_RECONF("tier-max-mb", migrate_mb, options, int32, out);
+ defrag->tier_conf.max_migrate_bytes = (uint64_t)migrate_mb * 1024 *
+ 1024;
+
+ GF_OPTION_RECONF("tier-max-files", defrag->tier_conf.max_migrate_files,
+ options, int32, out);
+
+ GF_OPTION_RECONF("tier-query-limit", defrag->tier_conf.query_limit,
+ options, int32, out);
+
+ GF_OPTION_RECONF("tier-pause", req_pause, options, bool, out);
+
+ if (req_pause == _gf_true) {
+ frame = create_frame(this, this->ctx->pool);
+ if (!frame)
+ goto out;
+
+ frame->root->pid = GF_CLIENT_PID_DEFRAG;
+
+ ret = synctask_new(this->ctx->env, tier_cli_pause,
+ tier_cli_pause_done, frame, this);
+
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+ "pause tier failed on reconfigure");
+ }
+ } else {
+ ret = gf_defrag_resume_tier(this, defrag);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+ "resume tier failed on reconfigure");
+ }
+ }
+ }
+
+out:
+ return dht_reconfigure(this, options);
+}
+
+void
+tier_fini(xlator_t *this)
+{
+ if (libhandle)
+ dlclose(libhandle);
+
+ GF_FREE(demotion_qfile);
+ GF_FREE(promotion_qfile);
+
+ dht_fini(this);
+}
+
+struct xlator_fops fops = {
+
+ .lookup = dht_lookup,
+ .create = tier_create,
+ .mknod = dht_mknod,
+
+ .open = dht_open,
+ .statfs = tier_statfs,
+ .opendir = dht_opendir,
+ .readdir = tier_readdir,
+ .readdirp = tier_readdirp,
+ .fsyncdir = dht_fsyncdir,
+ .symlink = dht_symlink,
+ .unlink = tier_unlink,
+ .link = tier_link,
+ .mkdir = dht_mkdir,
+ .rmdir = dht_rmdir,
+ .rename = dht_rename,
+ .entrylk = dht_entrylk,
+ .fentrylk = dht_fentrylk,
+
+ /* Inode read operations */
+ .stat = dht_stat,
+ .fstat = dht_fstat,
+ .access = dht_access,
+ .readlink = dht_readlink,
+ .getxattr = dht_getxattr,
+ .fgetxattr = dht_fgetxattr,
+ .readv = dht_readv,
+ .flush = dht_flush,
+ .fsync = dht_fsync,
+ .inodelk = dht_inodelk,
+ .finodelk = dht_finodelk,
+ .lk = dht_lk,
+
+ /* Inode write operations */
+ .fremovexattr = dht_fremovexattr,
+ .removexattr = dht_removexattr,
+ .setxattr = dht_setxattr,
+ .fsetxattr = dht_fsetxattr,
+ .truncate = dht_truncate,
+ .ftruncate = dht_ftruncate,
+ .writev = dht_writev,
+ .xattrop = dht_xattrop,
+ .fxattrop = dht_fxattrop,
+ .setattr = dht_setattr,
+ .fsetattr = dht_fsetattr,
+ .fallocate = dht_fallocate,
+ .discard = dht_discard,
+ .zerofill = dht_zerofill,
+};
+
+struct xlator_cbks cbks = {.release = dht_release, .forget = dht_forget};
+
+extern int32_t
+mem_acct_init(xlator_t *this);
+
+extern struct volume_options dht_options[];
+
+xlator_api_t xlator_api = {
+ .init = tier_init,
+ .fini = tier_fini,
+ .notify = dht_notify,
+ .reconfigure = tier_reconfigure,
+ .mem_acct_init = mem_acct_init,
+ .op_version = {GD_OP_VERSION_3_7_0}, /* Present from the initial version */
+ .fops = &fops,
+ .cbks = &cbks,
+ .options = dht_options,
+ .identifier = "tier",
+ .category = GF_MAINTAINED,
+};
+
diff --git a/xlators/cluster/dht/src/tier.h b/xlators/cluster/dht/src/tier.h
new file mode 100644
index 0000000..a20b1db
--- /dev/null
+++ b/xlators/cluster/dht/src/tier.h
@@ -0,0 +1,110 @@
+/*
+ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _TIER_H_
+#define _TIER_H_
+
+/******************************************************************************/
+/* This is from dht-rebalancer.c as we don't have dht-rebalancer.h */
+#include "dht-common.h"
+#include <glusterfs/xlator.h>
+#include <signal.h>
+#include <fnmatch.h>
+#include <signal.h>
+
+/*
+ * Size of timer wheel. We would not promote or demote less
+ * frequently than this number.
+ */
+#define TIMER_SECS 3600
+
+#include "gfdb_data_store.h"
+#include <ctype.h>
+#include <sys/stat.h>
+
+#define PROMOTION_QFILE "promotequeryfile"
+#define DEMOTION_QFILE "demotequeryfile"
+
+#define TIER_HASHED_SUBVOL conf->subvolumes[0]
+#define TIER_UNHASHED_SUBVOL conf->subvolumes[1]
+
+#define GET_QFILE_PATH(is_promotion)                                           \
+    ((is_promotion) ? promotion_qfile : demotion_qfile)
+
+typedef struct tier_qfile_array {
+ int *fd_array;
+ ssize_t array_size;
+ ssize_t next_index;
+ /* Indicates the number of exhausted FDs */
+ ssize_t exhausted_count;
+} tier_qfile_array_t;
+
+typedef struct _query_cbk_args {
+ xlator_t *this;
+ gf_defrag_info_t *defrag;
+ /* This is write */
+ int query_fd;
+ int is_promotion;
+ int is_compaction;
+ /* This is for read */
+ tier_qfile_array_t *qfile_array;
+} query_cbk_args_t;
+
+int
+gf_run_tier(xlator_t *this, gf_defrag_info_t *defrag);
+
+typedef struct gfdb_brick_info {
+ gfdb_time_t *time_stamp;
+ gf_boolean_t _gfdb_promote;
+ query_cbk_args_t *_query_cbk_args;
+} gfdb_brick_info_t;
+
+typedef struct brick_list {
+ xlator_t *xlator;
+ char *brick_db_path;
+ char brick_name[NAME_MAX];
+ char qfile_path[PATH_MAX];
+ struct list_head list;
+} tier_brick_list_t;
+
+typedef struct _dm_thread_args {
+ xlator_t *this;
+ gf_defrag_info_t *defrag;
+ struct list_head *brick_list;
+ int freq_time;
+ int return_value;
+ int is_promotion;
+ int is_compaction;
+ gf_boolean_t is_hot_tier;
+} migration_args_t;
+
+typedef enum tier_watermark_op_ {
+ TIER_WM_NONE = 0,
+ TIER_WM_LOW,
+ TIER_WM_HI,
+ TIER_WM_MID
+} tier_watermark_op_t;
+
+#define DEFAULT_PROMOTE_FREQ_SEC 120
+#define DEFAULT_DEMOTE_FREQ_SEC 120
+#define DEFAULT_HOT_COMPACT_FREQ_SEC 604800
+#define DEFAULT_COLD_COMPACT_FREQ_SEC 604800
+#define DEFAULT_DEMOTE_DEGRADED 1
+#define DEFAULT_WRITE_FREQ_SEC 0
+#define DEFAULT_READ_FREQ_SEC 0
+#define DEFAULT_WM_LOW 75
+#define DEFAULT_WM_HI 90
+#define DEFAULT_TIER_MODE TIER_MODE_TEST
+#define DEFAULT_COMP_MODE _gf_true
+#define DEFAULT_TIER_MAX_MIGRATE_MB 1000
+#define DEFAULT_TIER_MAX_MIGRATE_FILES 5000
+#define DEFAULT_TIER_QUERY_LIMIT 100
+
+#endif
diff --git a/xlators/features/Makefile.am b/xlators/features/Makefile.am
index 194634b..545c02b 100644
--- a/xlators/features/Makefile.am
+++ b/xlators/features/Makefile.am
@@ -5,6 +5,6 @@ endif
SUBDIRS = locks quota read-only quiesce marker index barrier arbiter upcall \
compress changelog gfid-access snapview-client snapview-server trash \
shard bit-rot leases selinux sdfs namespace $(CLOUDSYNC_DIR) thin-arbiter \
- utime
+ utime changetimerecorder
CLEANFILES =
diff --git a/xlators/features/changetimerecorder/Makefile.am b/xlators/features/changetimerecorder/Makefile.am
new file mode 100644
index 0000000..a985f42
--- /dev/null
+++ b/xlators/features/changetimerecorder/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/features/changetimerecorder/src/Makefile.am b/xlators/features/changetimerecorder/src/Makefile.am
new file mode 100644
index 0000000..620017e
--- /dev/null
+++ b/xlators/features/changetimerecorder/src/Makefile.am
@@ -0,0 +1,26 @@
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
+
+# changetimerecorder can only be built when libgfdb is enabled
+if BUILD_GFDB
+ xlator_LTLIBRARIES = changetimerecorder.la
+endif
+
+changetimerecorder_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+
+changetimerecorder_la_SOURCES = changetimerecorder.c \
+ ctr-helper.c ctr-xlator-ctx.c
+
+changetimerecorder_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la\
+ $(top_builddir)/libglusterfs/src/gfdb/libgfdb.la
+
+noinst_HEADERS = ctr-messages.h changetimerecorder.h ctr_mem_types.h \
+ ctr-helper.h ctr-xlator-ctx.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+ -I$(top_srcdir)/libglusterfs/src/gfdb \
+ -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \
+ -DDATADIR=\"$(localstatedir)\"
+
+AM_CFLAGS = -Wall $(GF_CFLAGS) $(SQLITE_CFLAGS)
+
+CLEANFILES =
diff --git a/xlators/features/changetimerecorder/src/changetimerecorder.c b/xlators/features/changetimerecorder/src/changetimerecorder.c
new file mode 100644
index 0000000..f2aa4a9
--- /dev/null
+++ b/xlators/features/changetimerecorder/src/changetimerecorder.c
@@ -0,0 +1,2371 @@
+/*
+ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#include <ctype.h>
+#include <sys/uio.h>
+
+#include "gfdb_sqlite3.h"
+#include "ctr-helper.h"
+#include "ctr-messages.h"
+#include <glusterfs/syscall.h>
+
+#include "changetimerecorder.h"
+#include "tier-ctr-interface.h"
+
+/*******************************inode forget***********************************/
+int
+ctr_forget(xlator_t *this, inode_t *inode)
+{
+ fini_ctr_xlator_ctx(this, inode); /* drop this xlator's ctx on the inode */
+ return 0; /* the result of the call above is not checked */
+}
+
+/************************** Look up heal **************************************/
+/*
+Problem: The CTR xlator records file meta (heat/hardlinks)
+into the database. This works fine for files which are created
+after ctr xlator is switched ON. But for files which were
+created before CTR xlator is ON, CTR xlator is not able to
+record either of the meta i.e heat or hardlinks. Thus making
+those files immune to promotions/demotions.
+
+Solution: The solution that is implemented in this patch is
+do ctr-db heal of all those pre-existent files, using named lookup.
+For this purpose we use the inode-xlator context variable option
+in gluster.
+The inode-xlator context variable for ctr xlator will have the
+following,
+ a. A Lock for the context variable
+ b. A hardlink list: This list represents the successful looked
+ up hardlinks.
+These are the scenarios when the hardlink list is updated:
+1) Named-Lookup: Whenever a named lookup happens on a file, in the
+ wind path we copy all required hardlink and inode information to
+ ctr_db_record structure, which resides in the frame->local variable.
+ We don't update the database in wind. During the unwind, we read the
+ information from the ctr_db_record and:
+ Check if the inode context variable is created; if not, we create it.
+ Check if the hard link is there in the hardlink list.
+ If it's not there we add it to the list and send an update to the
+ database using libgfdb.
+ Please note: The database transaction can fail (and we ignore it) as
+ there might already be a record in the db. This update to the db is to
+ heal the record if it's not there.
+ If it's there in the list we ignore it.
+2) Inode Forget: Whenever an inode forget hits we clear the hardlink list in
+ the inode context variable and delete the inode context variable.
+ Please note: An inode forget may happen for two reasons,
+ a. when the inode is deleted.
+ b. the in-memory inode is evicted from the inode table due to cache limits.
+3) create: whenever a create happens we create the inode context variable and
+ add the hardlink. The database update is done as usual by ctr.
+4) link: whenever a hardlink is created for the inode, we create the inode
+ context variable, if not present, and add the hardlink to the list.
+5) unlink: whenever an unlink happens we delete the hardlink from the list.
+6) mknod: same as create.
+7) rename: whenever a rename happens we update the hardlink in list. if the
+ hardlink was not present for update, we add the hardlink to the list.
+
+What is pending:
+1) This solution will only work for named lookups.
+2) We don't track afr-self-heal/dht-rebalancer traffic for healing.
+
+*/
+
+/* Lookup wind helper: allocates frame->local and fills its ctr_db_record.
+ * Nothing is written to the db here. Returns 0 on success (also when there
+ * is nothing to record) and -1, with frame->local reset to NULL, on error. */
+static int
+ctr_lookup_wind(call_frame_t *frame, xlator_t *this,
+ gf_ctr_inode_context_t *ctr_inode_cx)
+{
+ int ret = -1;
+ gf_ctr_private_t *_priv = NULL;
+ gf_ctr_local_t *ctr_local = NULL;
+
+ GF_ASSERT(frame);
+ GF_ASSERT(frame->root);
+ GF_ASSERT(this);
+ IS_CTR_INODE_CX_SANE(ctr_inode_cx);
+
+ _priv = this->private;
+ GF_ASSERT(_priv);
+
+ if (_priv->ctr_record_wind && ctr_inode_cx->ia_type != IA_IFDIR) {
+ frame->local = init_ctr_local_t(this);
+ if (!frame->local) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ CTR_MSG_CREATE_CTR_LOCAL_ERROR_WIND,
+ "WIND: Error while creating ctr local");
+ goto out;
+ };
+ ctr_local = frame->local;
+ /* Definitely no internal fops will reach here */
+ ctr_local->is_internal_fop = _gf_false;
+ /* Don't record counters */
+ CTR_DB_REC(ctr_local).do_record_counters = _gf_false;
+ /* Don't record time at all */
+ CTR_DB_REC(ctr_local).do_record_times = _gf_false;
+
+ /* Copy gfid into db record */
+ gf_uuid_copy(CTR_DB_REC(ctr_local).gfid, *(ctr_inode_cx->gfid));
+
+ /* Set fop_path and fop_type, required by libgfdb to make
+ * decision while inserting the record */
+ CTR_DB_REC(ctr_local).gfdb_fop_path = ctr_inode_cx->fop_path;
+ CTR_DB_REC(ctr_local).gfdb_fop_type = ctr_inode_cx->fop_type;
+
+ /* Copy hard link info; a truncated file name is treated as an error */
+ gf_uuid_copy(CTR_DB_REC(ctr_local).pargfid,
+ *((NEW_LINK_CX(ctr_inode_cx))->pargfid));
+ if (snprintf(CTR_DB_REC(ctr_local).file_name,
+ sizeof(CTR_DB_REC(ctr_local).file_name), "%s",
+ NEW_LINK_CX(ctr_inode_cx)->basename) >=
+ sizeof(CTR_DB_REC(ctr_local).file_name)) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ CTR_MSG_CREATE_CTR_LOCAL_ERROR_WIND,
+ "WIND: Error copying filename of ctr local");
+ goto out;
+ }
+ /* Since we are in lookup we can ignore errors while
+ * inserting into the DB, because healing may attempt
+ * the same DB write many times.
+ * We don't want to log all failed attempts and
+ * bloat the log */
+ ctr_local->gfdb_db_record.ignore_errors = _gf_true;
+ }
+
+ ret = 0;
+
+out:
+ /* ctr_local is still NULL here if init_ctr_local_t() itself failed */
+ if (ret) {
+ free_ctr_local(ctr_local);
+ frame->local = NULL;
+ }
+
+ return ret;
+}
+
+/* Inserts the ctr_db_record populated by ctr_lookup_wind into the db and
+ * always destroys frame->local. Returns 0, or -1 if insert_record failed. */
+static int
+ctr_lookup_unwind(call_frame_t *frame, xlator_t *this)
+{
+ int ret = -1;
+ gf_ctr_private_t *_priv = NULL;
+ gf_ctr_local_t *ctr_local = NULL;
+
+ GF_ASSERT(frame);
+ GF_ASSERT(this);
+
+ _priv = this->private;
+ GF_ASSERT(_priv);
+
+ GF_ASSERT(_priv->_db_conn);
+
+ ctr_local = frame->local;
+ /* Nothing is inserted without a local or for a directory inode */
+ if (ctr_local && (ctr_local->ia_inode_type != IA_IFDIR)) {
+ ret = insert_record(_priv->_db_conn, &ctr_local->gfdb_db_record);
+ if (ret == -1) {
+ gf_msg(this->name,
+ _gfdb_log_level(GF_LOG_ERROR,
+ ctr_local->gfdb_db_record.ignore_errors),
+ 0, CTR_MSG_FILL_CTR_LOCAL_ERROR_UNWIND,
+ "UNWIND: Error filling ctr local");
+ goto out;
+ }
+ }
+ ret = 0; /* any insert_record() result other than -1 counts as success */
+out:
+ free_ctr_local(ctr_local);
+ frame->local = NULL;
+ return ret;
+}
+
+/******************************************************************************
+ *
+ * FOPS HANDLING BELOW
+ *
+ * ***************************************************************************/
+
+/****************************LOOKUP********************************************/
+
+int32_t
+ctr_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *dict, struct iatt *postparent)
+{
+ int ret = -1;
+ ctr_xlator_ctx_t *ctr_xlator_ctx = NULL;
+ gf_ctr_local_t *ctr_local = NULL;
+ ctr_heal_ret_val_t ret_val = CTR_CTX_ERROR;
+ gf_boolean_t _is_heal_needed = _gf_false;
+ /* Lookup callback: heal the db entry of a non-directory inode if needed */
+ CTR_IS_DISABLED_THEN_GOTO(this, out);
+
+ /* If the lookup failed, don't do anything */
+ if (op_ret == -1) {
+ gf_msg_trace(this->name, 0, "lookup failed with %s",
+ strerror(op_errno));
+ goto out;
+ }
+
+ /* Ignore directory lookups */
+ if (inode->ia_type == IA_IFDIR) {
+ goto out;
+ }
+
+ /* If frame->local was not set by ctr_lookup(),
+ * don't do anything */
+ if (!frame->local) {
+ goto out;
+ }
+
+ /* If the lookup is for a dht link file, do not record */
+ if (dht_is_linkfile(buf, dict)) {
+ gf_msg_trace(this->name, 0,
+ "Ignoring Lookup "
+ "for dht link file");
+ goto out;
+ }
+
+ ctr_local = frame->local;
+ /* Assign the proper inode type */
+ ctr_local->ia_inode_type = inode->ia_type;
+
+ /* Copy gfid directly from inode */
+ gf_uuid_copy(CTR_DB_REC(ctr_local).gfid, inode->gfid);
+
+ /* Checking if gfid and parent gfid is valid */
+ if (gf_uuid_is_null(CTR_DB_REC(ctr_local).gfid) ||
+ gf_uuid_is_null(CTR_DB_REC(ctr_local).pargfid)) {
+ gf_msg_trace(this->name, 0, "Invalid GFID");
+ goto out;
+ }
+
+ /* If it's a first entry
+ * then mark the ctr_record for create
+ * A create will attempt a file and a hard link created in the db */
+ ctr_xlator_ctx = get_ctr_xlator_ctx(this, inode);
+ if (!ctr_xlator_ctx) {
+ /* This marks inode heal */
+ CTR_DB_REC(ctr_local).gfdb_fop_type = GFDB_FOP_CREATE_WRITE;
+ _is_heal_needed = _gf_true;
+ }
+
+ /* Copy the correct gfid from resolved inode (repeats the copy above) */
+ gf_uuid_copy(CTR_DB_REC(ctr_local).gfid, inode->gfid);
+
+ /* Add hard link to the list */
+ ret_val = add_hard_link_ctx(frame, this, inode);
+ if (ret_val == CTR_CTX_ERROR) {
+ gf_msg_trace(this->name, 0, "Failed adding hardlink to list");
+ goto out;
+ }
+ /* If inode needs healing then heal the hardlink also */
+ else if (ret_val & CTR_TRY_INODE_HEAL) {
+ /* This marks inode heal */
+ CTR_DB_REC(ctr_local).gfdb_fop_type = GFDB_FOP_CREATE_WRITE;
+ _is_heal_needed = _gf_true;
+ }
+ /* If hardlink needs healing */
+ else if (ret_val & CTR_TRY_HARDLINK_HEAL) {
+ _is_heal_needed = _gf_true;
+ }
+
+ /* Nothing to insert unless a lookup heal is needed */
+ if (!_is_heal_needed)
+ goto out;
+
+ /* FINALLY HEAL : Inserts the ctr_db_record populated by ctr_lookup_wind
+ * in to the db. It also destroys the frame->local
+ * created by ctr_lookup_wind */
+ ret = ctr_lookup_unwind(frame, this);
+ if (ret) {
+ gf_msg_trace(this->name, 0, "Failed healing/inserting link");
+ }
+
+out:
+ free_ctr_local((gf_ctr_local_t *)frame->local); /* may already be NULL */
+ frame->local = NULL;
+
+ STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno, inode, buf, dict,
+ postparent);
+
+ return 0;
+}
+
+int32_t
+ctr_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+ gf_ctr_inode_context_t ctr_inode_cx;
+ gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx;
+ gf_ctr_link_context_t ctr_link_cx;
+ gf_ctr_link_context_t *_link_cx = &ctr_link_cx;
+ int ret = -1;
+ /* Wind path of lookup: stage a db record for named lookups, then wind */
+ CTR_IS_DISABLED_THEN_GOTO(this, out);
+ CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, out);
+
+ GF_ASSERT(frame);
+ GF_ASSERT(frame->root);
+
+ /* Don't handle nameless lookups */
+ if (!loc->parent || !loc->name)
+ goto out;
+
+ /* Fill ctr link context */
+ FILL_CTR_LINK_CX(_link_cx, loc->parent->gfid, loc->name, out);
+
+ /* Fill ctr inode context */
+ /* IA_IFREG : We assume it's a file in the wind,
+ * but only in the unwind do we know whether the inode is a file
+ * or a directory
+ * gfid: we are just filling loc->gfid which is not correct.
+ * In unwind we fill the correct gfid for successful lookup */
+ FILL_CTR_INODE_CONTEXT(_inode_cx, IA_IFREG, loc->gfid, _link_cx, NULL,
+ GFDB_FOP_DENTRY_WRITE, GFDB_FOP_WIND);
+
+ /* Create the frame->local and populate ctr_db_record
+ * No writing to the db yet */
+ ret = ctr_lookup_wind(frame, this, _inode_cx);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_LINK_WIND_FAILED,
+ "Failed to insert link wind");
+ }
+ /* A failure above is only logged; the lookup is wound regardless */
+out:
+ STACK_WIND(frame, ctr_lookup_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->lookup, loc, xdata);
+ return 0;
+}
+
+/****************************WRITEV********************************************/
+int32_t
+ctr_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ int ret = -1;
+ /* Unwind path of writev: insert the unwind record; errors are only logged */
+ CTR_IS_DISABLED_THEN_GOTO(this, out);
+ CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, out);
+
+ ret = ctr_insert_unwind(frame, this, GFDB_FOP_INODE_WRITE, GFDB_FOP_UNWIND);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_WRITEV_UNWIND_FAILED,
+ "Failed to insert writev unwind");
+ }
+
+out:
+ ctr_free_frame_local(frame);
+ /* The fop result is passed up unchanged */
+ STACK_UNWIND_STRICT(writev, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+
+ return 0;
+}
+
+int32_t
+ctr_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
+ int32_t count, off_t off, uint32_t flags, struct iobref *iobref,
+ dict_t *xdata)
+{
+ int ret = -1;
+ gf_ctr_inode_context_t ctr_inode_cx;
+ gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx;
+ /* Wind path of writev: insert the wind record, then always wind the fop */
+ CTR_IS_DISABLED_THEN_GOTO(this, out);
+ CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, out);
+
+ /* Fill ctr inode context from fd->inode; no link contexts are passed */
+ FILL_CTR_INODE_CONTEXT(_inode_cx, fd->inode->ia_type, fd->inode->gfid, NULL,
+ NULL, GFDB_FOP_INODE_WRITE, GFDB_FOP_WIND);
+
+ /* Record into the database; a failure is only logged */
+ ret = ctr_insert_wind(frame, this, _inode_cx);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_WRITEV_WIND_FAILED,
+ "Failed to insert writev wind");
+ }
+
+out:
+ STACK_WIND(frame, ctr_writev_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->writev, fd, vector, count, off, flags,
+ iobref, xdata);
+
+ return 0;
+}
+
+/******************************setattr*****************************************/
+
+int32_t
+ctr_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *preop_stbuf,
+ struct iatt *postop_stbuf, dict_t *xdata)
+{
+ int ret = -1;
+ /* Unwind path of setattr: insert the unwind record; errors only logged */
+ CTR_IS_DISABLED_THEN_GOTO(this, out);
+ CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, out);
+
+ ret = ctr_insert_unwind(frame, this, GFDB_FOP_INODE_WRITE, GFDB_FOP_UNWIND);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ CTR_MSG_INSERT_SETATTR_UNWIND_FAILED,
+ "Failed to insert setattr unwind");
+ }
+
+out:
+ ctr_free_frame_local(frame);
+ /* The fop result is passed up unchanged */
+ STACK_UNWIND_STRICT(setattr, frame, op_ret, op_errno, preop_stbuf,
+ postop_stbuf, xdata);
+
+ return 0;
+}
+
+int32_t
+ctr_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf,
+ int32_t valid, dict_t *xdata)
+{
+ int ret = -1;
+ gf_ctr_inode_context_t ctr_inode_cx;
+ gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx;
+ /* Wind path of setattr: insert the wind record, then always wind the fop */
+ CTR_IS_DISABLED_THEN_GOTO(this, out);
+ CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, out);
+ CTR_RECORD_METADATA_HEAT_IS_DISABLED_THEN_GOTO(this, out);
+
+ /* Fill ctr inode context from loc->inode; no link contexts are passed */
+ FILL_CTR_INODE_CONTEXT(_inode_cx, loc->inode->ia_type, loc->inode->gfid,
+ NULL, NULL, GFDB_FOP_INODE_WRITE, GFDB_FOP_WIND);
+
+ /* Record into the database; a failure is only logged */
+ ret = ctr_insert_wind(frame, this, _inode_cx);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_SETATTR_WIND_FAILED,
+ "Failed to insert setattr wind");
+ }
+out:
+
+ STACK_WIND(frame, ctr_setattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata);
+
+ return 0;
+}
+
+/*************************** fsetattr ***************************************/
+int32_t
+ctr_fsetattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *preop_stbuf,
+ struct iatt *postop_stbuf, dict_t *xdata)
+{
+ int ret = -1;
+ /* Unwind path of fsetattr: insert the unwind record; errors only logged */
+ CTR_IS_DISABLED_THEN_GOTO(this, out);
+ CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, out);
+
+ ret = ctr_insert_unwind(frame, this, GFDB_FOP_INODE_WRITE, GFDB_FOP_UNWIND);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ CTR_MSG_INSERT_SETATTR_UNWIND_FAILED, /* shares the setattr msg id */
+ "Failed to insert fsetattr unwind");
+ }
+
+out:
+ ctr_free_frame_local(frame);
+ /* The fop result is passed up unchanged */
+ STACK_UNWIND_STRICT(fsetattr, frame, op_ret, op_errno, preop_stbuf,
+ postop_stbuf, xdata);
+
+ return 0;
+}
+
+int32_t
+ctr_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf,
+ int32_t valid, dict_t *xdata)
+{
+ int ret = -1;
+ gf_ctr_inode_context_t ctr_inode_cx;
+ gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx;
+ /* Wind path of fsetattr: insert the wind record, then always wind the fop */
+ CTR_IS_DISABLED_THEN_GOTO(this, out);
+ CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, out);
+ CTR_RECORD_METADATA_HEAT_IS_DISABLED_THEN_GOTO(this, out);
+
+ /* Fill ctr inode context from fd->inode; no link contexts are passed */
+ FILL_CTR_INODE_CONTEXT(_inode_cx, fd->inode->ia_type, fd->inode->gfid, NULL,
+ NULL, GFDB_FOP_INODE_WRITE, GFDB_FOP_WIND);
+
+ /* Record into the database; a failure is only logged */
+ ret = ctr_insert_wind(frame, this, _inode_cx);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_SETATTR_WIND_FAILED,
+ "Failed to insert fsetattr wind");
+ }
+out:
+ STACK_WIND(frame, ctr_fsetattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata);
+
+ return 0;
+}
+/****************************fremovexattr************************************/
+
+int32_t
+ctr_fremovexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ int ret = -1;
+ /* Unwind path of fremovexattr: insert the unwind record; errors only logged */
+ CTR_IS_DISABLED_THEN_GOTO(this, out);
+ CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, out);
+
+ ret = ctr_insert_unwind(frame, this, GFDB_FOP_INODE_WRITE, GFDB_FOP_UNWIND);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ CTR_MSG_INSERT_FREMOVEXATTR_UNWIND_FAILED,
+ "Failed to insert fremovexattr unwind");
+ }
+
+out:
+ ctr_free_frame_local(frame);
+ /* The fop result is passed up unchanged */
+ STACK_UNWIND_STRICT(fremovexattr, frame, op_ret, op_errno, xdata);
+
+ return 0;
+}
+
+int32_t
+ctr_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ const char *name, dict_t *xdata)
+{
+ int ret = -1;
+ gf_ctr_inode_context_t ctr_inode_cx;
+ gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx;
+ /* Wind path of fremovexattr: insert the wind record, then always wind */
+ CTR_IS_DISABLED_THEN_GOTO(this, out);
+ CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, out);
+ CTR_RECORD_METADATA_HEAT_IS_DISABLED_THEN_GOTO(this, out);
+
+ /* Fill ctr inode context from fd->inode; no link contexts are passed */
+ FILL_CTR_INODE_CONTEXT(_inode_cx, fd->inode->ia_type, fd->inode->gfid, NULL,
+ NULL, GFDB_FOP_INODE_WRITE, GFDB_FOP_WIND);
+
+ /* Record into the database; a failure is only logged */
+ ret = ctr_insert_wind(frame, this, _inode_cx);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ CTR_MSG_INSERT_FREMOVEXATTR_WIND_FAILED,
+ "Failed to insert fremovexattr wind");
+ }
+
+out:
+ STACK_WIND(frame, ctr_fremovexattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fremovexattr, fd, name, xdata);
+ return 0;
+}
+
+/****************************removexattr*************************************/
+
+int32_t
+ctr_removexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ int ret = -1;
+ /* Unwind path of removexattr: insert the unwind record; errors only logged */
+ CTR_IS_DISABLED_THEN_GOTO(this, out);
+ CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, out);
+ CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, out);
+ /* NOTE(review): ctr_fremovexattr_cbk has no internal-fop check - confirm */
+ ret = ctr_insert_unwind(frame, this, GFDB_FOP_INODE_WRITE, GFDB_FOP_UNWIND);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ CTR_MSG_INSERT_REMOVEXATTR_UNWIND_FAILED,
+ "Failed to insert removexattr unwind");
+ }
+
+out:
+ ctr_free_frame_local(frame);
+ /* The fop result is passed up unchanged */
+ STACK_UNWIND_STRICT(removexattr, frame, op_ret, op_errno, xdata);
+
+ return 0;
+}
+
+int32_t
+ctr_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, dict_t *xdata)
+{
+ int ret = -1;
+ gf_ctr_inode_context_t ctr_inode_cx;
+ gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx;
+ /* Wind path of removexattr: insert the wind record, then always wind */
+ CTR_IS_DISABLED_THEN_GOTO(this, out);
+ CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, out);
+ CTR_RECORD_METADATA_HEAT_IS_DISABLED_THEN_GOTO(this, out);
+
+ /* Fill ctr inode context from loc->inode; no link contexts are passed */
+ FILL_CTR_INODE_CONTEXT(_inode_cx, loc->inode->ia_type, loc->inode->gfid,
+ NULL, NULL, GFDB_FOP_INODE_WRITE, GFDB_FOP_WIND);
+
+ /* Record into the database; a failure is only logged */
+ ret = ctr_insert_wind(frame, this, _inode_cx);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ CTR_MSG_INSERT_REMOVEXATTR_WIND_FAILED,
+ "Failed to insert removexattr wind");
+ }
+
+out:
+ STACK_WIND(frame, ctr_removexattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->removexattr, loc, name, xdata);
+ return 0;
+}
+
+/****************************truncate****************************************/
+
+int32_t
+ctr_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ int ret = -1;
+ /* Unwind path of truncate: insert the unwind record; errors only logged */
+ CTR_IS_DISABLED_THEN_GOTO(this, out);
+ CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, out);
+
+ ret = ctr_insert_unwind(frame, this, GFDB_FOP_INODE_WRITE, GFDB_FOP_UNWIND);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ CTR_MSG_INSERT_TRUNCATE_UNWIND_FAILED,
+ "Failed to insert truncate unwind");
+ }
+
+out:
+ ctr_free_frame_local(frame);
+ /* The fop result is passed up unchanged */
+ STACK_UNWIND_STRICT(truncate, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+
+ return 0;
+}
+
+int32_t
+ctr_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+ dict_t *xdata)
+{
+ int ret = -1;
+ gf_ctr_inode_context_t ctr_inode_cx;
+ gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx;
+ /* Wind path of truncate: insert the wind record, then always wind the fop */
+ CTR_IS_DISABLED_THEN_GOTO(this, out);
+ CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, out);
+
+ /* Fill ctr inode context from loc->inode; no link contexts are passed */
+ FILL_CTR_INODE_CONTEXT(_inode_cx, loc->inode->ia_type, loc->inode->gfid,
+ NULL, NULL, GFDB_FOP_INODE_WRITE, GFDB_FOP_WIND);
+
+ /* Record into the database; a failure is only logged */
+ ret = ctr_insert_wind(frame, this, _inode_cx);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_TRUNCATE_WIND_FAILED,
+ "Failed to insert truncate wind");
+ }
+out:
+ STACK_WIND(frame, ctr_truncate_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
+ return 0;
+}
+
+/****************************ftruncate***************************************/
+
+int32_t
+ctr_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ int ret = -1;
+ /* Unwind path of ftruncate: insert the unwind record; errors only logged */
+ CTR_IS_DISABLED_THEN_GOTO(this, out);
+ CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, out);
+
+ ret = ctr_insert_unwind(frame, this, GFDB_FOP_INODE_WRITE, GFDB_FOP_UNWIND);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ CTR_MSG_INSERT_FTRUNCATE_UNWIND_FAILED,
+ "Failed to insert ftruncate unwind");
+ }
+
+out:
+ ctr_free_frame_local(frame);
+ /* The fop result is passed up unchanged */
+ STACK_UNWIND_STRICT(ftruncate, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+
+ return 0;
+}
+
+int32_t
+ctr_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ dict_t *xdata)
+{
+ int ret = -1;
+ gf_ctr_inode_context_t ctr_inode_cx;
+ gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx;
+ /* Wind path of ftruncate: insert the wind record, then always wind the fop */
+ CTR_IS_DISABLED_THEN_GOTO(this, out);
+ CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, out);
+
+ /* Fill ctr inode context from fd->inode; no link contexts are passed */
+ FILL_CTR_INODE_CONTEXT(_inode_cx, fd->inode->ia_type, fd->inode->gfid, NULL,
+ NULL, GFDB_FOP_INODE_WRITE, GFDB_FOP_WIND);
+
+ /* Record into the database; a failure is only logged */
+ ret = ctr_insert_wind(frame, this, _inode_cx);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ CTR_MSG_INSERT_FTRUNCATE_WIND_FAILED,
+ "Failed to insert ftruncate wind");
+ }
+
+out:
+ STACK_WIND(frame, ctr_ftruncate_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
+ return 0;
+}
+
+/****************************rename******************************************/
+int32_t
+ctr_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *buf,
+ struct iatt *preoldparent, struct iatt *postoldparent,
+ struct iatt *prenewparent, struct iatt *postnewparent,
+ dict_t *xdata)
+{
+ int ret = -1;
+ uint32_t remaining_links = -1;
+ gf_ctr_local_t *ctr_local = NULL;
+ gfdb_fop_type_t fop_type = GFDB_FOP_INVALID_OP;
+ gfdb_fop_path_t fop_path = GFDB_FOP_INVALID;
+ /* Unwind path of rename; also purges the db link of an overwritten file */
+ GF_ASSERT(frame);
+ GF_ASSERT(this);
+
+ CTR_IS_DISABLED_THEN_GOTO(this, out);
+ CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, out);
+
+ ret = ctr_insert_unwind(frame, this, GFDB_FOP_DENTRY_WRITE,
+ GFDB_FOP_UNWIND);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_RENAME_UNWIND_FAILED,
+ "Failed to insert rename unwind");
+ goto out;
+ }
+
+ if (!xdata)
+ goto out;
+ /*
+ * Extracting GF_RESPONSE_LINK_COUNT_XDATA from POSIX Xlator
+ * This is only set when we are overwriting hardlinks.
+ * Without it the link count is unknown, so nothing further is
+ * deleted from the db and the fop is simply unwound.
+ * */
+ ret = dict_get_uint32(xdata, GF_RESPONSE_LINK_COUNT_XDATA,
+ &remaining_links);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ CTR_MSG_GET_CTR_RESPONSE_LINK_COUNT_XDATA_FAILED,
+ "Failed to getting GF_RESPONSE_LINK_COUNT_XDATA");
+ remaining_links = -1;
+ goto out;
+ }
+
+ ctr_local = frame->local;
+ if (!ctr_local) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_NULL_LOCAL,
+ "ctr_local is NULL.");
+ goto out;
+ }
+
+ /* This is not the only link */
+ if (remaining_links > 1) {
+ fop_type = GFDB_FOP_DENTRY_WRITE;
+ fop_path = GFDB_FOP_UNDEL;
+ }
+ /* Last link that was deleted */
+ else if (remaining_links == 1) {
+ fop_type = GFDB_FOP_DENTRY_WRITE;
+ fop_path = GFDB_FOP_UNDEL_ALL;
+ } else {
+ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_RENAME_UNWIND_FAILED,
+ "Invalid link count from posix");
+ goto out;
+ }
+ /* Drop the overwritten link: the (pargfid, file_name) entry of old_gfid */
+ ret = ctr_delete_hard_link_from_db(
+ this, CTR_DB_REC(ctr_local).old_gfid, CTR_DB_REC(ctr_local).pargfid,
+ CTR_DB_REC(ctr_local).file_name, fop_type, fop_path);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_UNLINK_UNWIND_FAILED,
+ "Failed to delete records of %s",
+ CTR_DB_REC(ctr_local).file_name); /* the name passed to the delete */
+ }
+
+out:
+ ctr_free_frame_local(frame);
+
+ STACK_UNWIND_STRICT(rename, frame, op_ret, op_errno, buf, preoldparent,
+ postoldparent, prenewparent, postnewparent, xdata);
+
+ return 0;
+}
+
+int32_t
+ctr_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata)
+{
+ int ret = -1;
+ gf_ctr_inode_context_t ctr_inode_cx;
+ gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx;
+ gf_ctr_link_context_t new_link_cx, old_link_cx;
+ gf_ctr_link_context_t *_nlink_cx = &new_link_cx;
+ gf_ctr_link_context_t *_olink_cx = &old_link_cx;
+ int is_dict_created = 0;
+ ctr_xlator_ctx_t *ctr_xlator_ctx = NULL;
+ /* Wind path of rename: record old/new links, then always wind the fop */
+ CTR_IS_DISABLED_THEN_GOTO(this, out);
+ CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, out);
+
+ /* Fill old link context */
+ FILL_CTR_LINK_CX(_olink_cx, oldloc->pargfid, oldloc->name, out);
+
+ /* Fill new link context */
+ FILL_CTR_LINK_CX(_nlink_cx, newloc->pargfid, newloc->name, out);
+
+ /* Fill ctr inode context */
+ FILL_CTR_INODE_CONTEXT(_inode_cx, oldloc->inode->ia_type,
+ oldloc->inode->gfid, _nlink_cx, _olink_cx,
+ GFDB_FOP_DENTRY_WRITE, GFDB_FOP_WIND);
+
+ /* If the rename is an overwrite of a hardlink
+ * rename ("file1", "file2")
+ * file1 is hardlink for gfid say 00000000-0000-0000-0000-00000000000A
+ * file2 is hardlink for gfid say 00000000-0000-0000-0000-00000000000B
+ * so we are saving file2 gfid in old_gfid so that we delete entries
+ * from the db during rename callback if the fop is successful
+ * */
+ if (newloc->inode) {
+ /* This is the GFID from where the newloc hardlink will be
+ * unlinked */
+ _inode_cx->old_gfid = &newloc->inode->gfid;
+ }
+
+ /* Is a metadata fop */
+ _inode_cx->is_metadata_fop = _gf_true;
+
+ /* Record into the database */
+ ret = ctr_insert_wind(frame, this, _inode_cx);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_RENAME_WIND_FAILED,
+ "Failed to insert rename wind");
+ } else {
+ /* We update the hard link in the inode context in wind,
+ * as we don't get the "inode" in the call back for rename */
+ ret = update_hard_link_ctx(frame, this, oldloc->inode);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_UPDATE_HARDLINK_FAILED,
+ "Failed "
+ "updating hard link in ctr inode context");
+ goto out;
+ }
+
+ /* If the newloc has an inode, i.e. acquiring the hardlink of an
+ * existing file, i.e. overwriting a file.
+ * */
+ if (newloc->inode) {
+ /* Getting the ctr inode context variable for
+ * inode whose hardlink will be acquired during
+ * the rename
+ * */
+ ctr_xlator_ctx = get_ctr_xlator_ctx(this, newloc->inode);
+ if (!ctr_xlator_ctx) {
+ /* Since there is no ctr inode context
+ * there is nothing more to do */
+ ret = 0;
+ goto out;
+ }
+
+ /* Deleting hardlink from context variable */
+ ret = ctr_delete_hard_link(this, ctr_xlator_ctx, newloc->pargfid,
+ newloc->name);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ CTR_MSG_DELETE_HARDLINK_FAILED,
+ "Failed to delete hard link");
+ goto out;
+ }
+
+ /* Requesting for number of hardlinks on the newloc
+ * inode from POSIX.
+ * */
+ is_dict_created = set_posix_link_request(this, &xdata);
+ if (is_dict_created == -1) {
+ ret = -1;
+ goto out;
+ }
+ }
+ }
+
+out:
+ STACK_WIND(frame, ctr_rename_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata);
+ /* xdata is unref'd here only when set_posix_link_request() returned 1 */
+ if (is_dict_created == 1) {
+ dict_unref(xdata);
+ }
+
+ return 0;
+}
+
+/****************************unlink******************************************/
+int32_t
+ctr_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ int ret = -1;
+ uint32_t remaining_links = -1;
+ /* Unwind path of unlink: pick the db delete path from the link count */
+ CTR_IS_DISABLED_THEN_GOTO(this, out);
+ CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, out);
+
+ if (!xdata)
+ goto out;
+
+ /*
+ * Extracting GF_RESPONSE_LINK_COUNT_XDATA from POSIX Xlator.
+ * If the key is missing, remaining_links stays at (uint32_t)-1, which
+ * takes the "not the only link" branch below.
+ * */
+ ret = dict_get_uint32(xdata, GF_RESPONSE_LINK_COUNT_XDATA,
+ &remaining_links);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ CTR_MSG_GET_CTR_RESPONSE_LINK_COUNT_XDATA_FAILED,
+ "Failed to getting GF_RESPONSE_LINK_COUNT_XDATA");
+ remaining_links = -1;
+ }
+
+ /* This is not the only link */
+ if (remaining_links != 1) {
+ ret = ctr_insert_unwind(frame, this, GFDB_FOP_DENTRY_WRITE,
+ GFDB_FOP_UNDEL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ CTR_MSG_INSERT_UNLINK_UNWIND_FAILED,
+ "Failed to insert unlink unwind");
+ }
+ }
+ /* Last link that was deleted (this condition always holds here) */
+ else if (remaining_links == 1) {
+ ret = ctr_insert_unwind(frame, this, GFDB_FOP_DENTRY_WRITE,
+ GFDB_FOP_UNDEL_ALL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ CTR_MSG_INSERT_UNLINK_UNWIND_FAILED,
+ "Failed to insert unlink unwind");
+ }
+ }
+
+out:
+ ctr_free_frame_local(frame);
+
+ STACK_UNWIND_STRICT(unlink, frame, op_ret, op_errno, preparent, postparent,
+ xdata);
+
+ return 0;
+}
+
+int32_t
+ctr_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+ dict_t *xdata)
+{
+ int ret = -1;
+ gf_ctr_inode_context_t ctr_inode_cx;
+ gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx;
+ gf_ctr_link_context_t ctr_link_cx;
+ gf_ctr_link_context_t *_link_cx = &ctr_link_cx;
+ gf_boolean_t is_xdata_created = _gf_false;
+ struct iatt dummy_stat = {0};
+ /* Record the unlink on the wind path and ask posix for the link count. */
+ GF_ASSERT(frame);
+
+ CTR_IS_DISABLED_THEN_GOTO(this, out);
+
+ /*Fill link context*/
+ FILL_CTR_LINK_CX(_link_cx, loc->pargfid, loc->name, out);
+
+ /*Fill ctr inode context*/
+ FILL_CTR_INODE_CONTEXT(_inode_cx, loc->inode->ia_type, loc->inode->gfid,
+ _link_cx, NULL, GFDB_FOP_DENTRY_WRITE,
+ GFDB_FOP_WDEL);
+
+ /*Internal FOP*/
+ _inode_cx->is_internal_fop = is_internal_fop(frame, xdata);
+
+ /* Is a metadata FOP */
+ _inode_cx->is_metadata_fop = _gf_true;
+
+ /* If it's an internal FOP on a dht link file, do not record */
+ if (_inode_cx->is_internal_fop && dht_is_linkfile(&dummy_stat, xdata)) {
+ goto out;
+ }
+
+ /*record into the database*/
+ ret = ctr_insert_wind(frame, this, _inode_cx);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_UNLINK_UNWIND_FAILED,
+ "Failed to insert unlink wind");
+ } else {
+ /* We are doing delete of hard link in inode context in wind
+ * As we don't get the "inode" in the call back for unlink */
+ ret = delete_hard_link_ctx(frame, this, loc->inode);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_DELETE_HARDLINK_FAILED,
+ "Failed "
+ "deleting hard link from ctr inode context");
+ }
+ }
+
+ /*
+ * Sending GF_REQUEST_LINK_COUNT_XDATA
+ * to POSIX Xlator to send link count in unwind path.
+ * A dict created here belongs to this function and is released
+ * exactly once, after STACK_WIND at the end.
+ * */
+ /*create xdata if NULL*/
+ if (!xdata) {
+ xdata = dict_new();
+ is_xdata_created = (xdata) ? _gf_true : _gf_false;
+ }
+ if (!xdata) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_XDATA_NULL,
+ "xdata is NULL :Cannot send "
+ "GF_REQUEST_LINK_COUNT_XDATA to posix");
+ goto out;
+ }
+
+ ret = dict_set_int32(xdata, GF_REQUEST_LINK_COUNT_XDATA, 1);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ CTR_MSG_SET_CTR_RESPONSE_LINK_COUNT_XDATA_FAILED,
+ "Failed setting GF_REQUEST_LINK_COUNT_XDATA");
+ /* Do not drop a locally created xdata here: it is still passed to
+ * STACK_WIND below and is released exactly once after the wind. The
+ * fop continues without the link-count request. */
+ goto out;
+ }
+
+out:
+ STACK_WIND(frame, ctr_unlink_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata);
+
+ if (is_xdata_created)
+ dict_unref(xdata);
+
+ return 0;
+}
+
+/****************************fsync******************************************/
+/*
+ * ctr_fsync_cbk - unwind path of fsync.
+ *
+ * When CTR is active and the FOP succeeded, the unwind is recorded as an
+ * inode write. The frame local is released and the result is passed back
+ * to the parent unchanged in every case.
+ */
+int32_t
+ctr_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf,
+              dict_t *xdata)
+{
+    int rc = -1;
+
+    CTR_IS_DISABLED_THEN_GOTO(this, out);
+    CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, out);
+
+    /* A failed insert is only logged; it never fails the FOP */
+    rc = ctr_insert_unwind(frame, this, GFDB_FOP_INODE_WRITE, GFDB_FOP_UNWIND);
+    if (rc != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_FSYNC_UNWIND_FAILED,
+               "Failed to insert fsync unwind");
+    }
+
+out:
+    ctr_free_frame_local(frame);
+    STACK_UNWIND_STRICT(fsync, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+    return 0;
+}
+
+/*
+ * ctr_fsync - wind path of fsync.
+ *
+ * Skips recording when CTR is disabled or the FOP is internal; otherwise
+ * records an inode-write wind for the fd's inode. The FOP is wound to the
+ * first child regardless of the recording outcome.
+ */
+int32_t
+ctr_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
+          dict_t *xdata)
+{
+    gf_ctr_inode_context_t ctr_inode_cx;
+    gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx;
+    int rc = -1;
+
+    CTR_IS_DISABLED_THEN_GOTO(this, out);
+    CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, out);
+
+    /* Describe the inode being written; no link context is involved */
+    FILL_CTR_INODE_CONTEXT(_inode_cx, fd->inode->ia_type, fd->inode->gfid, NULL,
+                           NULL, GFDB_FOP_INODE_WRITE, GFDB_FOP_WIND);
+
+    /* Push the record to the database; failure is logged only */
+    rc = ctr_insert_wind(frame, this, _inode_cx);
+    if (rc != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_FSYNC_WIND_FAILED,
+               "Failed to insert fsync wind");
+    }
+
+out:
+    STACK_WIND(frame, ctr_fsync_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsync, fd, flags, xdata);
+    return 0;
+}
+
+/****************************setxattr****************************************/
+
+/*
+ * ctr_setxattr_cbk - unwind path of setxattr.
+ *
+ * Records the unwind as an inode write only when CTR is active and the
+ * FOP succeeded, matching the other callbacks in this file (e.g.
+ * ctr_fsetxattr_cbk). The frame local is always released and the result
+ * is always unwound to the parent unchanged.
+ */
+int
+ctr_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    int ret = -1;
+
+    CTR_IS_DISABLED_THEN_GOTO(this, out);
+    /* A failed setxattr must not be recorded as activity on the file */
+    CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, out);
+
+    ret = ctr_insert_unwind(frame, this, GFDB_FOP_INODE_WRITE, GFDB_FOP_UNWIND);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_FSYNC_UNWIND_FAILED,
+               "Failed to insert setxattr unwind");
+    }
+
+out:
+    ctr_free_frame_local(frame);
+
+    STACK_UNWIND_STRICT(setxattr, frame, op_ret, op_errno, xdata);
+
+    return 0;
+}
+
+/*
+ * ctr_setxattr - wind path of setxattr.
+ *
+ * Recording is skipped when CTR is disabled, when the FOP is internal, or
+ * when metadata heat recording is turned off. Otherwise an inode-write
+ * wind is recorded for loc's inode. The FOP is always wound to the child.
+ */
+int
+ctr_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr,
+             int flags, dict_t *xdata)
+{
+    gf_ctr_inode_context_t ctr_inode_cx;
+    gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx;
+    int rc = -1;
+
+    CTR_IS_DISABLED_THEN_GOTO(this, out);
+    CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, out);
+    CTR_RECORD_METADATA_HEAT_IS_DISABLED_THEN_GOTO(this, out);
+
+    /* Describe the inode whose xattrs are being changed */
+    FILL_CTR_INODE_CONTEXT(_inode_cx, loc->inode->ia_type, loc->inode->gfid,
+                           NULL, NULL, GFDB_FOP_INODE_WRITE, GFDB_FOP_WIND);
+
+    /* Push the record to the database; failure is logged only */
+    rc = ctr_insert_wind(frame, this, _inode_cx);
+    if (rc != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_SETATTR_WIND_FAILED,
+               "Failed to insert setxattr wind");
+    }
+
+out:
+    STACK_WIND(frame, ctr_setxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->setxattr, loc, xattr, flags, xdata);
+    return 0;
+}
+/**************************** fsetxattr *************************************/
+/*
+ * ctr_fsetxattr_cbk - unwind path of fsetxattr.
+ *
+ * Records the unwind as an inode write when CTR is active and the FOP
+ * succeeded, then releases the frame local and unwinds to the parent.
+ */
+int32_t
+ctr_fsetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    int rc = -1;
+
+    CTR_IS_DISABLED_THEN_GOTO(this, out);
+    CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, out);
+
+    /* A failed insert is only logged; it never fails the FOP */
+    rc = ctr_insert_unwind(frame, this, GFDB_FOP_INODE_WRITE, GFDB_FOP_UNWIND);
+    if (rc != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_FSYNC_UNWIND_FAILED,
+               "Failed to insert fsetxattr unwind");
+    }
+
+out:
+    ctr_free_frame_local(frame);
+    STACK_UNWIND_STRICT(fsetxattr, frame, op_ret, op_errno, xdata);
+    return 0;
+}
+
+/*
+ * ctr_fsetxattr - wind path of fsetxattr.
+ *
+ * Recording is skipped when CTR is disabled, when the FOP is internal, or
+ * when metadata heat recording is turned off. Otherwise an inode-write
+ * wind is recorded for the fd's inode. The FOP is always wound to the
+ * child.
+ */
+int32_t
+ctr_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
+              int32_t flags, dict_t *xdata)
+{
+    gf_ctr_inode_context_t ctr_inode_cx;
+    gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx;
+    int rc = -1;
+
+    CTR_IS_DISABLED_THEN_GOTO(this, out);
+    CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, out);
+    CTR_RECORD_METADATA_HEAT_IS_DISABLED_THEN_GOTO(this, out);
+
+    /* Describe the inode whose xattrs are being changed */
+    FILL_CTR_INODE_CONTEXT(_inode_cx, fd->inode->ia_type, fd->inode->gfid, NULL,
+                           NULL, GFDB_FOP_INODE_WRITE, GFDB_FOP_WIND);
+
+    /* Push the record to the database; failure is logged only */
+    rc = ctr_insert_wind(frame, this, _inode_cx);
+    if (rc != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_SETATTR_WIND_FAILED,
+               "Failed to insert fsetxattr wind");
+    }
+
+out:
+    STACK_WIND(frame, ctr_fsetxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata);
+    return 0;
+}
+/****************************mknod*******************************************/
+
+/*
+ * ctr_mknod_cbk - unwind path of mknod.
+ *
+ * On a successful FOP with CTR active, the new hard link is added to the
+ * CTR inode context and the unwind is recorded as a create-write. Both
+ * steps are best effort: failures are logged and the FOP result is
+ * unwound unchanged.
+ */
+int32_t
+ctr_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, inode_t *inode, struct iatt *buf,
+              struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+{
+    ctr_heal_ret_val_t ret_val = CTR_CTX_ERROR;
+    int rc = -1;
+
+    CTR_IS_DISABLED_THEN_GOTO(this, out);
+    CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, out);
+
+    /* Remember the new hard link in the inode context */
+    ret_val = add_hard_link_ctx(frame, this, inode);
+    if (ret_val == CTR_CTX_ERROR) {
+        gf_msg_trace(this->name, 0, "Failed adding hard link");
+    }
+
+    /* Record the unwind in the database */
+    rc = ctr_insert_unwind(frame, this, GFDB_FOP_CREATE_WRITE,
+                           GFDB_FOP_UNWIND);
+    if (rc != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_MKNOD_UNWIND_FAILED,
+               "Failed to insert mknod unwind");
+    }
+
+out:
+    ctr_free_frame_local(frame);
+    STACK_UNWIND_STRICT(mknod, frame, op_ret, op_errno, inode, buf, preparent,
+                        postparent, xdata);
+    return 0;
+}
+
+/*
+ * ctr_mknod - wind path of mknod.
+ *
+ * Skips recording when CTR is disabled or the FOP is internal. The gfid
+ * of the file being created is read from the "gfid-req" key of xdata; if
+ * it is missing, nothing is recorded. Otherwise a create-write wind with
+ * the parent gfid and basename is recorded. The FOP is always wound to
+ * the child.
+ */
+int
+ctr_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+          dev_t rdev, mode_t umask, dict_t *xdata)
+{
+    gf_ctr_inode_context_t ctr_inode_cx;
+    gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx;
+    gf_ctr_link_context_t ctr_link_cx;
+    gf_ctr_link_context_t *_link_cx = &ctr_link_cx;
+    uuid_t gfid = {
+        0,
+    };
+    uuid_t *ptr_gfid = &gfid;
+    int rc = -1;
+
+    CTR_IS_DISABLED_THEN_GOTO(this, out);
+    CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, out);
+
+    GF_ASSERT(frame);
+    GF_ASSERT(frame->root);
+
+    /* The gfid of the new file travels in the request dict */
+    rc = dict_get_gfuuid(xdata, "gfid-req", &gfid);
+    if (rc != 0) {
+        gf_msg_debug(this->name, 0, "failed to get gfid from dict");
+        goto out;
+    }
+
+    /* Parent gfid + basename of the new entry */
+    FILL_CTR_LINK_CX(_link_cx, loc->pargfid, loc->name, out);
+
+    /* Inode description for the record */
+    FILL_CTR_INODE_CONTEXT(_inode_cx, loc->inode->ia_type, *ptr_gfid, _link_cx,
+                           NULL, GFDB_FOP_CREATE_WRITE, GFDB_FOP_WIND);
+
+    /* Push the record to the database; failure is logged only */
+    rc = ctr_insert_wind(frame, this, _inode_cx);
+    if (rc != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_MKNOD_WIND_FAILED,
+               "Failed to insert mknod wind");
+    }
+
+out:
+    STACK_WIND(frame, ctr_mknod_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, umask, xdata);
+    return 0;
+}
+
+/****************************create******************************************/
+/*
+ * ctr_create_cbk - unwind path of create.
+ *
+ * On a successful FOP with CTR active, the new hard link is added to the
+ * CTR inode context and the unwind is recorded as a create-write.
+ * Failures of either step are logged; the FOP result is unwound
+ * unchanged.
+ */
+int
+ctr_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+               int op_errno, fd_t *fd, inode_t *inode, struct iatt *stbuf,
+               struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+{
+    int rc = -1;
+
+    CTR_IS_DISABLED_THEN_GOTO(this, out);
+    CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, out);
+
+    /* Remember the new hard link in the inode context */
+    rc = add_hard_link_ctx(frame, this, inode);
+    if (rc != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_ADD_HARDLINK_FAILED,
+               "Failed adding hard link");
+    }
+
+    /* Record the unwind in the database */
+    rc = ctr_insert_unwind(frame, this, GFDB_FOP_CREATE_WRITE,
+                           GFDB_FOP_UNWIND);
+    if (rc != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_CREATE_UNWIND_FAILED,
+               "Failed to insert create unwind");
+    }
+
+out:
+    ctr_free_frame_local(frame);
+    STACK_UNWIND_STRICT(create, frame, op_ret, op_errno, fd, inode, stbuf,
+                        preparent, postparent, xdata);
+    return 0;
+}
+
+/*
+ * ctr_create - wind path of create.
+ *
+ * Skips recording when CTR is disabled, when xdata carries no "gfid-req",
+ * or when the FOP is internal and targets a DHT link file. Otherwise a
+ * create-write wind with the parent gfid and basename is recorded. The
+ * FOP is always wound to the child.
+ */
+int
+ctr_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+           mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
+{
+    gf_ctr_inode_context_t ctr_inode_cx;
+    gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx;
+    gf_ctr_link_context_t ctr_link_cx;
+    gf_ctr_link_context_t *_link_cx = &ctr_link_cx;
+    struct iatt dummy_stat = {0};
+    uuid_t gfid = {
+        0,
+    };
+    uuid_t *ptr_gfid = &gfid;
+    int rc = -1;
+
+    CTR_IS_DISABLED_THEN_GOTO(this, out);
+
+    GF_ASSERT(frame);
+    GF_ASSERT(frame->root);
+
+    /* The gfid of the new file travels in the request dict */
+    rc = dict_get_gfuuid(xdata, "gfid-req", &gfid);
+    if (rc != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_GET_GFID_FROM_DICT_FAILED,
+               "failed to get gfid from dict");
+        goto out;
+    }
+
+    /* Parent gfid + basename of the new entry */
+    FILL_CTR_LINK_CX(_link_cx, loc->pargfid, loc->name, out);
+
+    /* Inode description for the record */
+    FILL_CTR_INODE_CONTEXT(_inode_cx, loc->inode->ia_type, *ptr_gfid, _link_cx,
+                           NULL, GFDB_FOP_CREATE_WRITE, GFDB_FOP_WIND);
+
+    _inode_cx->is_internal_fop = is_internal_fop(frame, xdata);
+
+    /* Internal FOPs on dht link files are not recorded */
+    if (_inode_cx->is_internal_fop && dht_is_linkfile(&dummy_stat, xdata)) {
+        goto out;
+    }
+
+    /* Push the record to the database; failure is logged only */
+    rc = ctr_insert_wind(frame, this, &ctr_inode_cx);
+    if (rc != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_CREATE_WIND_FAILED,
+               "Failed to insert create wind");
+    }
+
+out:
+    STACK_WIND(frame, ctr_create_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd,
+               xdata);
+    return 0;
+}
+
+/****************************link********************************************/
+
+/*
+ * ctr_link_cbk - unwind path of link.
+ *
+ * On a successful FOP with CTR active, the new hard link is added to the
+ * CTR inode context and the unwind is recorded as a dentry write. Both
+ * steps are best effort: failures are logged and the FOP result is
+ * unwound unchanged.
+ */
+int
+ctr_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+             int op_errno, inode_t *inode, struct iatt *stbuf,
+             struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+{
+    int ret = -1;
+
+    CTR_IS_DISABLED_THEN_GOTO(this, out);
+    CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, out);
+
+    /* Add hard link to the list */
+    ret = add_hard_link_ctx(frame, this, inode);
+    if (ret) {
+        gf_msg_trace(this->name, 0, "Failed adding hard link");
+    }
+
+    ret = ctr_insert_unwind(frame, this, GFDB_FOP_DENTRY_WRITE,
+                            GFDB_FOP_UNWIND);
+    if (ret) {
+        /* The text names the link FOP; it used to say "create", which
+         * sent anyone reading the log to the wrong code path. */
+        gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_CREATE_UNWIND_FAILED,
+               "Failed to insert link unwind");
+    }
+
+out:
+    ctr_free_frame_local(frame);
+
+    STACK_UNWIND_STRICT(link, frame, op_ret, op_errno, inode, stbuf, preparent,
+                        postparent, xdata);
+    return 0;
+}
+
+/*
+ * ctr_link - wind path of link.
+ *
+ * Skips recording when CTR is disabled or when the FOP is internal and
+ * targets a DHT link file. Otherwise records a dentry-write wind for the
+ * existing inode (oldloc) under the new parent/basename (newloc), flagged
+ * as a metadata FOP. The FOP is always wound to the child.
+ */
+int
+ctr_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+         dict_t *xdata)
+{
+    gf_ctr_inode_context_t ctr_inode_cx;
+    gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx;
+    gf_ctr_link_context_t ctr_link_cx;
+    gf_ctr_link_context_t *_link_cx = &ctr_link_cx;
+    struct iatt dummy_stat = {0};
+    int rc = -1;
+
+    CTR_IS_DISABLED_THEN_GOTO(this, out);
+
+    GF_ASSERT(frame);
+    GF_ASSERT(frame->root);
+
+    /* The new dentry: parent gfid + basename */
+    FILL_CTR_LINK_CX(_link_cx, newloc->pargfid, newloc->name, out);
+
+    /* The inode gaining the link */
+    FILL_CTR_INODE_CONTEXT(_inode_cx, oldloc->inode->ia_type,
+                           oldloc->inode->gfid, _link_cx, NULL,
+                           GFDB_FOP_DENTRY_WRITE, GFDB_FOP_WIND);
+
+    _inode_cx->is_internal_fop = is_internal_fop(frame, xdata);
+    _inode_cx->is_metadata_fop = _gf_true;
+
+    /* Internal FOPs on dht link files are not recorded */
+    if (_inode_cx->is_internal_fop && dht_is_linkfile(&dummy_stat, xdata)) {
+        goto out;
+    }
+
+    /* Push the record to the database; failure is logged only */
+    rc = ctr_insert_wind(frame, this, _inode_cx);
+    if (rc != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_LINK_WIND_FAILED,
+               "Failed to insert link wind");
+    }
+
+out:
+    STACK_WIND(frame, ctr_link_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata);
+    return 0;
+}
+
+/******************************readv*****************************************/
+/*
+ * ctr_readv_cbk - unwind path of readv.
+ *
+ * Records the unwind as an inode read when CTR is active and the FOP
+ * succeeded. The frame local is always released and the result is always
+ * unwound to the parent unchanged.
+ */
+int
+ctr_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+              int op_errno, struct iovec *vector, int count, struct iatt *stbuf,
+              struct iobref *iobref, dict_t *xdata)
+{
+    int ret = -1;
+
+    CTR_IS_DISABLED_THEN_GOTO(this, out);
+    CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, out);
+
+    ret = ctr_insert_unwind(frame, this, GFDB_FOP_INODE_READ, GFDB_FOP_UNWIND);
+    if (ret) {
+        /* The text names the readv FOP; it used to say "create", which
+         * sent anyone reading the log to the wrong code path. */
+        gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_CREATE_UNWIND_FAILED,
+               "Failed to insert readv unwind");
+    }
+
+out:
+    ctr_free_frame_local(frame);
+
+    STACK_UNWIND_STRICT(readv, frame, op_ret, op_errno, vector, count, stbuf,
+                        iobref, xdata);
+    return 0;
+}
+
+/*
+ * ctr_readv - wind path of readv.
+ *
+ * Skips recording when CTR is disabled or the FOP is internal; otherwise
+ * records an inode-read wind for the fd's inode. The FOP is always wound
+ * to the child.
+ */
+int
+ctr_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, off_t off,
+          uint32_t flags, dict_t *xdata)
+{
+    gf_ctr_inode_context_t ctr_inode_cx;
+    gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx;
+    int rc = -1;
+
+    CTR_IS_DISABLED_THEN_GOTO(this, out);
+    CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, out);
+
+    /* Describe the inode being read; no link context is involved */
+    FILL_CTR_INODE_CONTEXT(_inode_cx, fd->inode->ia_type, fd->inode->gfid, NULL,
+                           NULL, GFDB_FOP_INODE_READ, GFDB_FOP_WIND);
+
+    /* Push the record to the database; failure is logged only */
+    rc = ctr_insert_wind(frame, this, _inode_cx);
+    if (rc != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_READV_WIND_FAILED,
+               "Failed to insert readv wind");
+    }
+
+out:
+    STACK_WIND(frame, ctr_readv_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readv, fd, size, off, flags, xdata);
+    return 0;
+}
+
+/*******************************ctr_ipc****************************************/
+
+/*
+ * Per-record callback handed to the database query functions.
+ *
+ * args is a ctr_query_cbk_args_t: the record is appended to its query_fd
+ * and, on success, its count is incremented.
+ *
+ * Returns 0 on success, -1 if args is NULL, or the non-zero status of
+ * gfdb_write_query_record() if the write fails.
+ */
+static int
+ctr_db_query_callback(gfdb_query_record_t *gfdb_query_record, void *args)
+{
+    ctr_query_cbk_args_t *cbk_state = args;
+    int rc = -1;
+
+    GF_VALIDATE_OR_GOTO("ctr", cbk_state, out);
+
+    rc = gfdb_write_query_record(cbk_state->query_fd, gfdb_query_record);
+    if (rc != 0) {
+        gf_msg("ctr", GF_LOG_ERROR, 0, CTR_MSG_FATAL_ERROR,
+               "Failed to write to query file");
+    } else {
+        /* Only records that reached the file are counted */
+        cbk_state->count++;
+    }
+
+out:
+    return rc;
+}
+
+/* This function does all the db queries related to tiering and
+ * generates/populates new/existing query file
+ * inputs:
+ * xlator_t *this : CTR Translator
+ * void *conn_node : Database connection
+ * char *query_file: the query file that needs to be updated
+ *                   (opened for append, created if missing)
+ * gfdb_ipc_ctr_params_t *ipc_ctr_params: the query parameters
+ *
+ * Query selection:
+ *   demotion + emergency_demote           -> find_all (bounded by
+ *                                            query_limit)
+ *   demotion, both freq thresholds zero   -> find_unchanged_for_time
+ *   demotion otherwise                    -> find_unchanged_for_time_freq
+ *   promotion, both freq thresholds zero  -> find_recently_changed_files
+ *   promotion otherwise                   -> find_recently_changed_files_freq
+ * After a successful query the heat counters are cleared with
+ * clear_files_heat().
+ *
+ * Return:
+ * On success, the number of records written to the query file
+ * On failure, the non-zero status of the failing step
+ * (-1 for argument validation and open failures)
+ * */
+int
+ctr_db_query(xlator_t *this, void *conn_node, char *query_file,
+             gfdb_ipc_ctr_params_t *ipc_ctr_params)
+{
+    int ret = -1;
+    ctr_query_cbk_args_t query_cbk_args = {0};
+
+    /* Mark the descriptor as "not opened" before anything can jump to
+     * the cleanup code. With the zero initialiser alone query_fd would
+     * be 0, and a validation failure below would close descriptor 0. */
+    query_cbk_args.query_fd = -1;
+
+    GF_VALIDATE_OR_GOTO("ctr", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, conn_node, out);
+    GF_VALIDATE_OR_GOTO(this->name, query_file, out);
+    GF_VALIDATE_OR_GOTO(this->name, ipc_ctr_params, out);
+
+    /*Query for eligible files from db*/
+    query_cbk_args.query_fd = open(query_file, O_WRONLY | O_CREAT | O_APPEND,
+                                   S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+    if (query_cbk_args.query_fd < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, CTR_MSG_FATAL_ERROR,
+               "Failed to open query file %s", query_file);
+        goto out;
+    }
+    if (!ipc_ctr_params->is_promote) {
+        if (ipc_ctr_params->emergency_demote) {
+            /* emergency demotion mode */
+            ret = find_all(conn_node, ctr_db_query_callback,
+                           (void *)&query_cbk_args,
+                           ipc_ctr_params->query_limit);
+        } else {
+            if (ipc_ctr_params->write_freq_threshold == 0 &&
+                ipc_ctr_params->read_freq_threshold == 0) {
+                ret = find_unchanged_for_time(conn_node, ctr_db_query_callback,
+                                              (void *)&query_cbk_args,
+                                              &ipc_ctr_params->time_stamp);
+            } else {
+                ret = find_unchanged_for_time_freq(
+                    conn_node, ctr_db_query_callback, (void *)&query_cbk_args,
+                    &ipc_ctr_params->time_stamp,
+                    ipc_ctr_params->write_freq_threshold,
+                    ipc_ctr_params->read_freq_threshold, _gf_false);
+            }
+        }
+    } else {
+        if (ipc_ctr_params->write_freq_threshold == 0 &&
+            ipc_ctr_params->read_freq_threshold == 0) {
+            ret = find_recently_changed_files(conn_node, ctr_db_query_callback,
+                                              (void *)&query_cbk_args,
+                                              &ipc_ctr_params->time_stamp);
+        } else {
+            ret = find_recently_changed_files_freq(
+                conn_node, ctr_db_query_callback, (void *)&query_cbk_args,
+                &ipc_ctr_params->time_stamp,
+                ipc_ctr_params->write_freq_threshold,
+                ipc_ctr_params->read_freq_threshold, _gf_false);
+        }
+    }
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_FATAL_ERROR,
+               "FATAL: query from db failed");
+        goto out;
+    }
+
+    ret = clear_files_heat(conn_node);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_FATAL_ERROR,
+               "FATAL: Failed to clear db entries");
+        goto out;
+    }
+
+    ret = 0;
+out:
+
+    /* On success the caller gets the record count instead of 0 */
+    if (!ret)
+        ret = query_cbk_args.count;
+
+    if (query_cbk_args.query_fd >= 0) {
+        sys_close(query_cbk_args.query_fd);
+        query_cbk_args.query_fd = -1;
+    }
+
+    return ret;
+}
+
+/*
+ * ctr_compact_thread - thread entry point that compacts the CTR database.
+ *
+ * args is the CTR xlator. The compaction parameters are read from the
+ * xlator's private data, compact_db() is run, and afterwards both
+ * compaction flags are cleared under compact_lock. If the lock cannot be
+ * taken the flags are left untouched. Always returns NULL.
+ */
+void *
+ctr_compact_thread(void *args)
+{
+    xlator_t *this = (xlator_t *)args;
+    gf_ctr_private_t *priv = NULL;
+    void *db_conn = NULL;
+    gf_boolean_t compact_active = _gf_false;
+    gf_boolean_t compact_mode_switched = _gf_false;
+    int rc = -1;
+
+    GF_VALIDATE_OR_GOTO("ctr", this, out);
+
+    /* Snapshot the parameters the compaction run needs */
+    priv = this->private;
+    db_conn = priv->_db_conn;
+    compact_active = priv->compact_active;
+    compact_mode_switched = priv->compact_mode_switched;
+
+    gf_msg("ctr-compact", GF_LOG_INFO, 0, CTR_MSG_SET, "Starting compaction");
+
+    rc = compact_db(db_conn, compact_active, compact_mode_switched);
+    if (rc != 0) {
+        gf_msg("ctr-compact", GF_LOG_ERROR, 0, CTR_MSG_SET,
+               "Failed to perform the compaction");
+    }
+
+    rc = pthread_mutex_lock(&priv->compact_lock);
+    if (rc != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_SET,
+               "Failed to acquire lock");
+        goto out;
+    }
+
+    /* Compaction on this brick is finished: clear both flags */
+    priv->compact_active = _gf_false;
+    priv->compact_mode_switched = _gf_false;
+
+    rc = pthread_mutex_unlock(&priv->compact_lock);
+    if (rc != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_SET,
+               "Failed to release lock");
+    }
+
+out:
+    return NULL;
+}
+
+/*
+ * ctr_ipc_helper - execute one CTR IPC operation.
+ *
+ * The operation is read from in_dict under GFDB_IPC_CTR_KEY and matched by
+ * prefix against the known operation names:
+ *   - clear:         clear the heat counters in the database
+ *   - query:         run ctr_db_query() into the file named in in_dict and
+ *                    store its return value in out_dict under
+ *                    GFDB_IPC_CTR_RET_QUERY_COUNT
+ *   - db version:    store the database version in out_dict
+ *   - db param:      store the requested database setting in out_dict
+ *   - compact:       set the compaction flags from in_dict and spawn
+ *                    ctr_compact_thread
+ * Any other operation is rejected.
+ *
+ * Returns 0 on success and non-zero on failure. A compaction request that
+ * arrives while another compaction is in progress is logged and returns 0.
+ */
+int
+ctr_ipc_helper(xlator_t *this, dict_t *in_dict, dict_t *out_dict)
+{
+    int ret = -1;
+    char *ctr_ipc_ops = NULL;
+    gf_ctr_private_t *priv = NULL;
+    char *db_version = NULL;
+    char *db_param_key = NULL;
+    char *db_param = NULL;
+    char *query_file = NULL;
+    gfdb_ipc_ctr_params_t *ipc_ctr_params = NULL;
+    int result = 0;
+    pthread_t compact_thread;
+
+    GF_VALIDATE_OR_GOTO("ctr", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, priv->_db_conn, out);
+    GF_VALIDATE_OR_GOTO(this->name, in_dict, out);
+    GF_VALIDATE_OR_GOTO(this->name, out_dict, out);
+
+    GET_DB_PARAM_FROM_DICT(this->name, in_dict, GFDB_IPC_CTR_KEY, ctr_ipc_ops,
+                           out);
+
+    /*if its a db clear operation */
+    if (strncmp(ctr_ipc_ops, GFDB_IPC_CTR_CLEAR_OPS,
+                SLEN(GFDB_IPC_CTR_CLEAR_OPS)) == 0) {
+        ret = clear_files_heat(priv->_db_conn);
+        if (ret)
+            goto out;
+
+    } /* if its a query operation, in which case its query + clear db*/
+    else if (strncmp(ctr_ipc_ops, GFDB_IPC_CTR_QUERY_OPS,
+                     SLEN(GFDB_IPC_CTR_QUERY_OPS)) == 0) {
+        ret = dict_get_str(in_dict, GFDB_IPC_CTR_GET_QFILE_PATH, &query_file);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_SET,
+                   "Failed extracting query file path");
+            goto out;
+        }
+
+        ret = dict_get_bin(in_dict, GFDB_IPC_CTR_GET_QUERY_PARAMS,
+                           (void *)&ipc_ctr_params);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_SET,
+                   "Failed extracting query parameters");
+            goto out;
+        }
+
+        /* The query result (record count, or a failure status) is
+         * reported to the caller through out_dict, not through the
+         * return value of this function. */
+        ret = ctr_db_query(this, priv->_db_conn, query_file, ipc_ctr_params);
+
+        ret = dict_set_int32(out_dict, GFDB_IPC_CTR_RET_QUERY_COUNT, ret);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_SET,
+                   "Failed setting query reply");
+            goto out;
+        }
+
+    } /* if its a query for db version */
+    else if (strncmp(ctr_ipc_ops, GFDB_IPC_CTR_GET_DB_VERSION_OPS,
+                     SLEN(GFDB_IPC_CTR_GET_DB_VERSION_OPS)) == 0) {
+        ret = get_db_version(priv->_db_conn, &db_version);
+        if (ret == -1 || !db_version) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_SET,
+                   "Failed extracting db version ");
+            goto out;
+        }
+
+        SET_DB_PARAM_TO_DICT(this->name, out_dict, GFDB_IPC_CTR_RET_DB_VERSION,
+                             db_version, ret, error);
+
+    } /* if its a query for a db setting */
+    else if (strncmp(ctr_ipc_ops, GFDB_IPC_CTR_GET_DB_PARAM_OPS,
+                     SLEN(GFDB_IPC_CTR_GET_DB_PARAM_OPS)) == 0) {
+        ret = dict_get_str(in_dict, GFDB_IPC_CTR_GET_DB_KEY, &db_param_key);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_SET,
+                   "Failed extracting db param key");
+            goto out;
+        }
+
+        ret = get_db_params(priv->_db_conn, db_param_key, &db_param);
+        if (ret == -1 || !db_param) {
+            goto out;
+        }
+
+        SET_DB_PARAM_TO_DICT(this->name, out_dict, db_param_key, db_param, ret,
+                             error);
+    } /* if its an attempt to compact the database */
+    else if (strncmp(ctr_ipc_ops, GFDB_IPC_CTR_SET_COMPACT_PRAGMA,
+                     SLEN(GFDB_IPC_CTR_SET_COMPACT_PRAGMA)) == 0) {
+        ret = pthread_mutex_lock(&priv->compact_lock);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_SET,
+                   "Failed to acquire lock for compaction");
+            goto out;
+        }
+
+        if ((priv->compact_active || priv->compact_mode_switched)) {
+            /* Compaction in progress. LEAVE (ret is 0 here) */
+            gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_SET,
+                   "Compaction already in progress.");
+            pthread_mutex_unlock(&priv->compact_lock);
+            goto out;
+        }
+        /* At this point, we should be the only one on the brick */
+        /* compacting */
+
+        /* Grab the arguments from the dictionary */
+        ret = dict_get_int32(in_dict, "compact_active", &result);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_SET,
+                   "Failed to get compaction type");
+            /* Leaving with the lock held would block every later
+             * compaction request and the compaction thread itself. */
+            pthread_mutex_unlock(&priv->compact_lock);
+            goto out;
+        }
+
+        if (result) {
+            priv->compact_active = _gf_true;
+        }
+
+        ret = dict_get_int32(in_dict, "compact_mode_switched", &result);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_SET,
+                   "Failed to see if compaction switched");
+            /* No thread will be started, so undo the flag set above
+             * and release the lock before bailing out. */
+            priv->compact_active = _gf_false;
+            pthread_mutex_unlock(&priv->compact_lock);
+            goto out;
+        }
+
+        if (result) {
+            priv->compact_mode_switched = _gf_true;
+            gf_msg("ctr-compact", GF_LOG_TRACE, 0, CTR_MSG_SET,
+                   "Pre-thread: Compact mode switch is true");
+        } else {
+            gf_msg("ctr-compact", GF_LOG_TRACE, 0, CTR_MSG_SET,
+                   "Pre-thread: Compact mode switch is false");
+        }
+
+        ret = pthread_mutex_unlock(&priv->compact_lock);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_SET,
+                   "Failed to release lock for compaction");
+            goto out;
+        }
+
+        /* NOTE(review): the thread handle is a local and is never
+         * joined; confirm whether gf_thread_create() detaches it. */
+        ret = gf_thread_create(&compact_thread, NULL, ctr_compact_thread,
+                               (void *)this, "ctrcomp");
+
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_SET,
+                   "Failed to spawn compaction thread");
+            /* Only ctr_compact_thread clears the flags. Without a
+             * thread they would stay set and every later request
+             * would be refused as "already in progress". */
+            if (pthread_mutex_lock(&priv->compact_lock) == 0) {
+                priv->compact_active = _gf_false;
+                priv->compact_mode_switched = _gf_false;
+                pthread_mutex_unlock(&priv->compact_lock);
+            }
+            goto out;
+        }
+
+        goto out;
+    } /* default case */
+    else {
+        goto out;
+    }
+
+    ret = 0;
+    goto out;
+error:
+    /* db_param_key is NOT freed here: it was obtained with dict_get_str()
+     * and still belongs to in_dict. */
+    GF_FREE(db_param);
+    GF_FREE(db_version);
+out:
+    return ret;
+}
+
+/*
+ * IPC entry point used by the tier migrator.
+ *
+ * Requests whose op is GF_IPC_TARGET_CTR are handled here through
+ * ctr_ipc_helper() and unwound with a freshly created reply dict (NULL if
+ * the dict could not be created). Requests for any other op, or with a
+ * NULL in_dict, are wound to the first child untouched.
+ */
+int32_t
+ctr_ipc(call_frame_t *frame, xlator_t *this, int32_t op, dict_t *in_dict)
+{
+    gf_ctr_private_t *priv = NULL;
+    dict_t *reply = NULL;
+    int ret = -1;
+
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+    GF_ASSERT(priv->_db_conn);
+    GF_VALIDATE_OR_GOTO(this->name, in_dict, wind);
+
+    if (op != GF_IPC_TARGET_CTR)
+        goto wind;
+
+    reply = dict_new();
+    if (reply) {
+        ret = ctr_ipc_helper(this, in_dict, reply);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_SET,
+                   "Failed in ctr_ipc_helper");
+        }
+    }
+
+    /* ret is still -1 here when the reply dict could not be created */
+    STACK_UNWIND_STRICT(ipc, frame, ret, 0, reply);
+
+    if (reply)
+        dict_unref(reply);
+
+    return 0;
+
+wind:
+    STACK_WIND(frame, default_ipc_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->ipc, op, in_dict);
+
+    return 0;
+}
+
+/*
+ * Call to initialize db for ctr xlator while ctr is enabled.
+ *
+ * Sets up, in order: the compaction flags and mutex, the db parameter
+ * dict, the xlator's local memory pool and the database connection.
+ *
+ * Ownership: priv belongs to the caller and is never freed here. This
+ * matters for reconfigure(), where priv is this->private and must stay
+ * valid after a failure. On failure everything this function set up is
+ * torn down again (memory pool, mutex, ctr_db_path) so that a later
+ * retry starts from a clean state.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int32_t
+initialize_ctr_resource(xlator_t *this, gf_ctr_private_t *priv)
+{
+    int ret_db = -1;
+    dict_t *params_dict = NULL;
+    gf_boolean_t lock_ready = _gf_false;
+
+    /* Nothing has been set up yet, so there is nothing to undo */
+    if (!this || !priv)
+        return -1;
+
+    /* For compaction */
+    priv->compact_active = _gf_false;
+    priv->compact_mode_switched = _gf_false;
+    ret_db = pthread_mutex_init(&priv->compact_lock, NULL);
+
+    if (ret_db) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_FATAL_ERROR,
+               "FATAL: Failed initializing compaction mutex");
+        goto error;
+    }
+    lock_ready = _gf_true;
+
+    params_dict = dict_new();
+    if (!params_dict) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INIT_DB_PARAMS_FAILED,
+               "DB Params cannot initialized!");
+        goto error;
+    }
+
+    /*Extract db params options*/
+    ret_db = extract_db_params(this, params_dict, priv->gfdb_db_type);
+    if (ret_db) {
+        gf_msg(this->name, GF_LOG_ERROR, 0,
+               CTR_MSG_EXTRACT_DB_PARAM_OPTIONS_FAILED,
+               "Failed extracting db params options");
+        goto error;
+    }
+
+    /*Create a memory pool for ctr xlator*/
+    this->local_pool = mem_pool_new(gf_ctr_local_t, 64);
+    if (!this->local_pool) {
+        gf_msg(this->name, GF_LOG_ERROR, 0,
+               CTR_MSG_CREATE_LOCAL_MEMORY_POOL_FAILED,
+               "failed to create local memory pool");
+        goto error;
+    }
+
+    /*Initialize Database Connection*/
+    priv->_db_conn = init_db(params_dict, priv->gfdb_db_type);
+    if (!priv->_db_conn) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_FATAL_ERROR,
+               "FATAL: Failed initializing data base");
+        goto error;
+    }
+
+    ret_db = 0;
+    goto out;
+
+error:
+    /* Clear the pool pointer after destroying it: fini() destroys
+     * this->local_pool again and must not see a stale pointer. */
+    mem_pool_destroy(this->local_pool);
+    this->local_pool = NULL;
+
+    /* A retry re-initialises the mutex, so destroy it first */
+    if (lock_ready)
+        pthread_mutex_destroy(&priv->compact_lock);
+
+    GF_FREE(priv->ctr_db_path);
+    priv->ctr_db_path = NULL;
+    ret_db = -1;
+out:
+    if (params_dict)
+        dict_unref(params_dict);
+
+    return ret_db;
+}
+
+/******************************************************************************/
+/*
+ * reconfigure - apply changed volume options to a running CTR xlator.
+ *
+ * If CTR is (still) disabled nothing else is done. If CTR was enabled
+ * after an init() that skipped resource setup, the resources are created
+ * now; when that fails, CTR is switched back to disabled so that neither
+ * the FOPs nor fini() touch resources that do not exist. Failures to set
+ * the sqlite parameters are logged but do not fail the reconfigure.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int
+reconfigure(xlator_t *this, dict_t *options)
+{
+    char *temp_str = NULL;
+    int ret = 0;
+    gf_ctr_private_t *priv = NULL;
+
+    priv = this->private;
+
+    if (dict_get_str(options, "changetimerecorder.frequency", &temp_str)) {
+        gf_msg(this->name, GF_LOG_TRACE, 0, CTR_MSG_SET, "set");
+    }
+
+    GF_OPTION_RECONF("ctr-enabled", priv->enabled, options, bool, out);
+    if (!priv->enabled) {
+        gf_msg(GFDB_DATA_STORE, GF_LOG_INFO, 0, CTR_MSG_XLATOR_DISABLED,
+               "CTR Xlator is not enabled so skip ctr reconfigure");
+        goto out;
+    }
+
+    /* If ctr is enabled after skip init for ctr xlator then call
+       initialize_ctr_resource during reconfigure phase to allocate resources
+       for xlator
+    */
+    if (priv->enabled && !priv->_db_conn) {
+        ret = initialize_ctr_resource(this, priv);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_FATAL_ERROR,
+                   "FATAL: Failed ctr initialize resource");
+            /* priv is still this->private and stays allocated; fall
+             * back to the disabled state because no resources exist. */
+            priv->enabled = _gf_false;
+            goto out;
+        }
+    }
+
+    GF_OPTION_RECONF("record-counters", priv->ctr_record_counter, options, bool,
+                     out);
+
+    GF_OPTION_RECONF("ctr-record-metadata-heat", priv->ctr_record_metadata_heat,
+                     options, bool, out);
+
+    GF_OPTION_RECONF("ctr_link_consistency", priv->ctr_link_consistency,
+                     options, bool, out);
+
+    GF_OPTION_RECONF("ctr_lookupheal_inode_timeout",
+                     priv->ctr_lookupheal_inode_timeout, options, uint64, out);
+
+    GF_OPTION_RECONF("ctr_lookupheal_link_timeout",
+                     priv->ctr_lookupheal_link_timeout, options, uint64, out);
+
+    GF_OPTION_RECONF("record-exit", priv->ctr_record_unwind, options, bool,
+                     out);
+
+    GF_OPTION_RECONF("record-entry", priv->ctr_record_wind, options, bool, out);
+
+    /* If database is sqlite */
+    if (priv->gfdb_db_type == GFDB_SQLITE3) {
+        /* AUTOCHECKPOINT */
+        if (dict_get_str(options, GFDB_SQL_PARAM_WAL_AUTOCHECK, &temp_str) ==
+            0) {
+            ret = set_db_params(priv->_db_conn, "wal_autocheckpoint", temp_str);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0,
+                       CTR_MSG_SET_VALUE_TO_SQL_PARAM_FAILED,
+                       "Failed to set %s", GFDB_SQL_PARAM_WAL_AUTOCHECK);
+            }
+        }
+
+        /* CACHE_SIZE */
+        if (dict_get_str(options, GFDB_SQL_PARAM_CACHE_SIZE, &temp_str) == 0) {
+            ret = set_db_params(priv->_db_conn, "cache_size", temp_str);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0,
+                       CTR_MSG_SET_VALUE_TO_SQL_PARAM_FAILED,
+                       "Failed to set %s", GFDB_SQL_PARAM_CACHE_SIZE);
+            }
+        }
+    }
+
+    /* sqlite parameter failures above are best effort */
+    ret = 0;
+
+out:
+
+    return ret;
+}
+
+/****************************init********************************************/
+
+/*
+ * init - xlator initialisation.
+ *
+ * Validates the graph position (exactly one child), allocates the private
+ * structure with its defaults, reads the xlator options and, if CTR is
+ * enabled, sets up the database resources. When CTR is disabled the
+ * resource setup is skipped and left to reconfigure().
+ *
+ * Returns 0 on success, -1 on failure. On failure the private structure
+ * is freed here and this->private is left unset.
+ */
+int32_t
+init(xlator_t *this)
+{
+    gf_ctr_private_t *priv = NULL;
+    int ret_db = -1;
+
+    if (!this) {
+        gf_msg("ctr", GF_LOG_ERROR, 0, CTR_MSG_FATAL_ERROR,
+               "FATAL: ctr this is not initialized");
+        return -1;
+    }
+
+    if (!this->children || this->children->next) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_FATAL_ERROR,
+               "FATAL: ctr should have exactly one child");
+        return -1;
+    }
+
+    if (!this->parents) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, CTR_MSG_DANGLING_VOLUME,
+               "dangling volume. check volfile ");
+    }
+
+    priv = GF_CALLOC(1, sizeof(*priv), gf_ctr_mt_private_t);
+    if (!priv) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, CTR_MSG_CALLOC_FAILED,
+               "Calloc did not work!!!");
+        return -1;
+    }
+
+    /*Default values for the translator*/
+    priv->ctr_record_wind = _gf_true;
+    priv->ctr_record_unwind = _gf_false;
+    priv->ctr_hot_brick = _gf_false;
+    priv->gfdb_db_type = GFDB_SQLITE3;
+    priv->gfdb_sync_type = GFDB_DB_SYNC;
+    priv->_db_conn = NULL;
+    priv->ctr_lookupheal_link_timeout = CTR_DEFAULT_HARDLINK_EXP_PERIOD;
+    priv->ctr_lookupheal_inode_timeout = CTR_DEFAULT_INODE_EXP_PERIOD;
+
+    /*Extract ctr xlator options*/
+    ret_db = extract_ctr_options(this, priv);
+    if (ret_db) {
+        gf_msg(this->name, GF_LOG_ERROR, 0,
+               CTR_MSG_EXTRACT_CTR_XLATOR_OPTIONS_FAILED,
+               "Failed extracting ctr xlator options");
+        GF_FREE(priv);
+        return -1;
+    }
+
+    if (!priv->enabled) {
+        gf_msg(GFDB_DATA_STORE, GF_LOG_INFO, 0, CTR_MSG_XLATOR_DISABLED,
+               "CTR Xlator is not enabled so skip ctr init");
+        goto out;
+    }
+
+    ret_db = initialize_ctr_resource(this, priv);
+    if (ret_db) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_FATAL_ERROR,
+               "FATAL: Failed ctr initialize resource");
+        /* initialize_ctr_resource() cleaned up what it created but
+         * leaves priv to its owner, which is this function. */
+        GF_FREE(priv);
+        return -1;
+    }
+
+out:
+    this->private = (void *)priv;
+    return 0;
+}
+
+/*
+ * notify - event hook of the CTR xlator.
+ *
+ * Events are passed to default_notify() as long as the xlator has private
+ * data; without it the event is dropped and 0 is returned.
+ */
+int
+notify(xlator_t *this, int event, void *data, ...)
+{
+    gf_ctr_private_t *priv = this->private;
+
+    if (!priv) {
+        return 0;
+    }
+
+    return default_notify(this, event, data);
+}
+
+/*
+ * mem_acct_init - set up memory accounting for the CTR xlator.
+ *
+ * Registers gf_ctr_mt_end + 1 memory types with xlator_mem_acct_init().
+ * Returns 0 on success, -1 if this is NULL, or the non-zero status of
+ * xlator_mem_acct_init() on failure.
+ */
+int32_t
+mem_acct_init(xlator_t *this)
+{
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO("ctr", this, out);
+
+    ret = xlator_mem_acct_init(this, gf_ctr_mt_end + 1);
+
+    if (ret != 0) {
+        /* The two literals used to be joined without a space, which
+         * logged "Memory accounting initfailed". */
+        gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_MEM_ACC_INIT_FAILED,
+               "Memory accounting init "
+               "failed");
+        return ret;
+    }
+out:
+    return ret;
+}
+
+/*
+ * fini - xlator teardown.
+ *
+ * When CTR is enabled the database connection is closed, the db path is
+ * freed and the compaction mutex is destroyed; failures are logged as
+ * warnings. The private structure and the local memory pool are released
+ * in every case.
+ */
+void
+fini(xlator_t *this)
+{
+    gf_ctr_private_t *priv = this->private;
+
+    if (priv && priv->enabled) {
+        if (fini_db(priv->_db_conn)) {
+            gf_msg(this->name, GF_LOG_WARNING, 0, CTR_MSG_CLOSE_DB_CONN_FAILED,
+                   "Failed closing db connection");
+        }
+
+        /* The connection is gone whether or not closing succeeded */
+        priv->_db_conn = NULL;
+
+        GF_FREE(priv->ctr_db_path);
+
+        if (pthread_mutex_destroy(&priv->compact_lock)) {
+            gf_msg(this->name, GF_LOG_WARNING, 0, CTR_MSG_CLOSE_DB_CONN_FAILED,
+                   "Failed to destroy the compaction mutex");
+        }
+    }
+
+    GF_FREE(priv);
+
+    mem_pool_destroy(this->local_pool);
+    this->local_pool = NULL;
+}
+
+/* FOP dispatch table: the file operations this xlator implements itself */
+struct xlator_fops fops = {
+    /* lookup */
+    .lookup = ctr_lookup,
+
+    /* write fops */
+    .mknod = ctr_mknod,
+    .create = ctr_create,
+    .truncate = ctr_truncate,
+    .ftruncate = ctr_ftruncate,
+    .setxattr = ctr_setxattr,
+    .fsetxattr = ctr_fsetxattr,
+    .removexattr = ctr_removexattr,
+    .fremovexattr = ctr_fremovexattr,
+    .unlink = ctr_unlink,
+    .link = ctr_link,
+    .rename = ctr_rename,
+    .writev = ctr_writev,
+    .setattr = ctr_setattr,
+    .fsetattr = ctr_fsetattr,
+
+    /* read fops */
+    .readv = ctr_readv,
+
+    /* IPC call */
+    .ipc = ctr_ipc,
+};
+
+struct xlator_cbks cbks = {.forget = ctr_forget};
+
+struct volume_options options[] = {
+ {.key =
+ {
+ "ctr-enabled",
+ },
+ .type = GF_OPTION_TYPE_BOOL,
+ .value = {"on", "off"},
+ .default_value = "off",
+ .description = "Enables the CTR",
+ .flags = OPT_FLAG_SETTABLE},
+ {.key = {"record-entry"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .value = {"on", "off"},
+ .default_value = "on"},
+ {.key = {"record-exit"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .value = {"on", "off"},
+ .default_value = "off"},
+ {.key = {"record-counters"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .value = {"on", "off"},
+ .default_value = "off",
+ .op_version = {GD_OP_VERSION_3_7_0},
+ .flags = OPT_FLAG_SETTABLE,
+ .tags = {}},
+ {.key = {"ctr-record-metadata-heat"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .value = {"on", "off"},
+ .default_value = "off",
+ .flags = OPT_FLAG_SETTABLE,
+ .op_version = {GD_OP_VERSION_3_7_0},
+ .tags = {}},
+ {.key = {"ctr_link_consistency"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .value = {"on", "off"},
+ .default_value = "off",
+ .flags = OPT_FLAG_SETTABLE,
+ .op_version = {GD_OP_VERSION_3_7_0},
+ .tags = {}},
+ {.key = {"ctr_lookupheal_link_timeout"},
+ .type = GF_OPTION_TYPE_INT,
+ .default_value = "300",
+ .flags = OPT_FLAG_SETTABLE,
+ .op_version = {GD_OP_VERSION_3_7_2},
+ .tags = {}},
+ {.key = {"ctr_lookupheal_inode_timeout"},
+ .type = GF_OPTION_TYPE_INT,
+ .default_value = "300",
+ .flags = OPT_FLAG_SETTABLE,
+ .op_version = {GD_OP_VERSION_3_7_2},
+ .tags = {}},
+ {.key = {"hot-brick"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .value = {"on", "off"},
+ .default_value = "off"},
+ {.key = {"db-type"},
+ .type = GF_OPTION_TYPE_STR,
+ .value = {"hashfile", "rocksdb", "changelog", "sqlite3", "hyperdex"},
+ .default_value = "sqlite3",
+ .op_version = {GD_OP_VERSION_3_7_0},
+ .flags = OPT_FLAG_SETTABLE,
+ .tags = {}},
+ {.key = {"db-sync"},
+ .type = GF_OPTION_TYPE_STR,
+ .value = {"sync", "async"},
+ .default_value = "sync"},
+ {.key = {"db-path"}, .type = GF_OPTION_TYPE_PATH},
+ {.key = {"db-name"}, .type = GF_OPTION_TYPE_STR},
+ {.key = {GFDB_SQL_PARAM_SYNC},
+ .type = GF_OPTION_TYPE_STR,
+ .value = {"off", "normal", "full"},
+ .default_value = "normal"},
+ {.key = {GFDB_SQL_PARAM_JOURNAL_MODE},
+ .type = GF_OPTION_TYPE_STR,
+ .value = {"delete", "truncate", "persist", "memory", "wal", "off"},
+ .default_value = "wal",
+ .flags = OPT_FLAG_SETTABLE,
+ .op_version = {GD_OP_VERSION_3_7_0},
+ .tags = {}},
+ {.key = {GFDB_SQL_PARAM_AUTO_VACUUM},
+ .type = GF_OPTION_TYPE_STR,
+ .value = {"off", "full", "incr"},
+ .default_value = "off",
+ .flags = OPT_FLAG_SETTABLE,
+ .op_version = {GD_OP_VERSION_3_7_0},
+ .tags = {}},
+ {.key = {GFDB_SQL_PARAM_WAL_AUTOCHECK},
+ .type = GF_OPTION_TYPE_INT,
+ .default_value = "25000",
+ .flags = OPT_FLAG_SETTABLE,
+ .op_version = {GD_OP_VERSION_3_7_0},
+ .tags = {}},
+ {.key = {GFDB_SQL_PARAM_CACHE_SIZE},
+ .type = GF_OPTION_TYPE_INT,
+ .default_value = "12500",
+ .flags = OPT_FLAG_SETTABLE,
+ .op_version = {GD_OP_VERSION_3_7_0},
+ .tags = {}},
+ {.key = {GFDB_SQL_PARAM_PAGE_SIZE},
+ .type = GF_OPTION_TYPE_INT,
+ .default_value = "4096",
+ .flags = OPT_FLAG_SETTABLE,
+ .op_version = {GD_OP_VERSION_3_7_0},
+ .tags = {}},
+ {.key = {NULL}},
+};
+
+xlator_api_t xlator_api = {
+ .init = init,
+ .fini = fini,
+ .notify = notify,
+ .reconfigure = reconfigure,
+ .mem_acct_init = mem_acct_init,
+ .op_version = {GD_OP_VERSION_3_7_0}, /* Present from the initial version */
+ .fops = &fops,
+ .cbks = &cbks,
+ .identifier = "changetimerecorder",
+ .category = GF_MAINTAINED,
+ .options = options,
+};
diff --git a/xlators/features/changetimerecorder/src/changetimerecorder.h b/xlators/features/changetimerecorder/src/changetimerecorder.h
new file mode 100644
index 0000000..0150a1c
--- /dev/null
+++ b/xlators/features/changetimerecorder/src/changetimerecorder.h
@@ -0,0 +1,21 @@
+/*
+ Copyright (c) 2006-2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __CTR_H
+#define __CTR_H
+
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/common-utils.h>
+#include "ctr_mem_types.h"
+#include "ctr-helper.h"
+
+#endif /* __CTR_H */
diff --git a/xlators/features/changetimerecorder/src/ctr-helper.c b/xlators/features/changetimerecorder/src/ctr-helper.c
new file mode 100644
index 0000000..e1e6573
--- /dev/null
+++ b/xlators/features/changetimerecorder/src/ctr-helper.c
@@ -0,0 +1,293 @@
+/*
+ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "gfdb_sqlite3.h"
+#include "ctr-helper.h"
+#include "ctr-messages.h"
+
+/*******************************************************************************
+ * Fill unwind into db record: stores fop_path, fop_type and the unwind time
+ * in the gfdb_db_record of ctr_local.
+ * Returns 0 on success and -1 on a non-unwind fop_path or a failed time read.
+ ******************************************************************************/
+int
+fill_db_record_for_unwind(xlator_t *this, gf_ctr_local_t *ctr_local,
+ gfdb_fop_type_t fop_type, gfdb_fop_path_t fop_path)
+{
+ int ret = -1;
+ gfdb_time_t *ctr_uwtime = NULL;
+ gf_ctr_private_t *_priv = NULL;
+
+ GF_ASSERT(this);
+ _priv = this->private;
+ GF_ASSERT(_priv);
+
+ GF_ASSERT(ctr_local);
+
+ /* Only an unwind fop_path is accepted here; anything else is an error */
+ if (!isunwindpath(fop_path)) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_WRONG_FOP_PATH,
+ "Wrong fop_path. Should be unwind");
+ goto out;
+ }
+
+ ctr_uwtime = &CTR_DB_REC(ctr_local).gfdb_unwind_change_time;
+ CTR_DB_REC(ctr_local).gfdb_fop_path = fop_path;
+ CTR_DB_REC(ctr_local).gfdb_fop_type = fop_type;
+
+ ret = gettimeofday(ctr_uwtime, NULL); /* written straight into the record */
+ if (ret == -1) {
+ gf_msg(this->name, GF_LOG_ERROR, errno,
+ CTR_MSG_FILL_UNWIND_TIME_REC_ERROR,
+ "Error "
+ "filling unwind time record %s",
+ strerror(errno));
+ goto out;
+ }
+
+ /* Special case, i.e. if it is a tier rebalance
+ * + cold tier brick
+ * + it is a create/mknod FOP
+ * we record the unwind time as zero */
+ if (ctr_local->client_pid == GF_CLIENT_PID_TIER_DEFRAG &&
+ (!_priv->ctr_hot_brick) && isdentrycreatefop(fop_type)) {
+ memset(ctr_uwtime, 0, sizeof(*ctr_uwtime));
+ }
+ ret = 0;
+out:
+ return ret;
+}
+
+/*******************************************************************************
+ * Fill wind into db record: stores the wind time, fop path/type, gfid(s) and,
+ * for dentry fops, the parent gfid and basename of the new/old link.
+ * Returns 0 on success, -1 on error (the record is cleared in that case).
+ ******************************************************************************/
+int
+fill_db_record_for_wind(xlator_t *this, gf_ctr_local_t *ctr_local,
+                        gf_ctr_inode_context_t *ctr_inode_cx)
+{
+    int ret = -1;
+    gfdb_time_t *ctr_wtime = NULL;
+    gf_ctr_private_t *_priv = NULL;
+
+    GF_ASSERT(this);
+    _priv = this->private;
+    GF_ASSERT(_priv);
+    GF_ASSERT(ctr_local);
+    IS_CTR_INODE_CX_SANE(ctr_inode_cx);
+
+    /*if not wind path error!*/
+    if (!iswindpath(ctr_inode_cx->fop_path)) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_WRONG_FOP_PATH,
+               "Wrong fop_path. Should be wind");
+        goto out;
+    }
+
+    ctr_wtime = &CTR_DB_REC(ctr_local).gfdb_wind_change_time;
+    CTR_DB_REC(ctr_local).gfdb_fop_path = ctr_inode_cx->fop_path;
+    CTR_DB_REC(ctr_local).gfdb_fop_type = ctr_inode_cx->fop_type;
+    CTR_DB_REC(ctr_local).link_consistency = _priv->ctr_link_consistency;
+    ret = gettimeofday(ctr_wtime, NULL);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno,
+               CTR_MSG_FILL_UNWIND_TIME_REC_ERROR,
+               "Error filling wind time record %s", strerror(errno));
+        goto out;
+    }
+    /* Special case i.e if its a tier rebalance
+     * + cold tier brick
+     * + its a create/mknod FOP
+     * we record wind time as zero */
+    if (ctr_local->client_pid == GF_CLIENT_PID_TIER_DEFRAG &&
+        (!_priv->ctr_hot_brick) && isdentrycreatefop(ctr_inode_cx->fop_type)) {
+        memset(ctr_wtime, 0, sizeof(*ctr_wtime));
+    }
+
+    /* Copy gfid into db record */
+    gf_uuid_copy(CTR_DB_REC(ctr_local).gfid, *(ctr_inode_cx->gfid));
+    /* Copy older gfid if any */
+    if (ctr_inode_cx->old_gfid &&
+        (!gf_uuid_is_null(*(ctr_inode_cx->old_gfid)))) {
+        gf_uuid_copy(CTR_DB_REC(ctr_local).old_gfid, *(ctr_inode_cx->old_gfid));
+    }
+
+    /*Hard Links: bounded copy, a name that does not fit is an error*/
+    ret = -1;
+    if (isdentryfop(ctr_inode_cx->fop_type)) {
+        /*new link fop*/
+        if (NEW_LINK_CX(ctr_inode_cx)) {
+            gf_uuid_copy(CTR_DB_REC(ctr_local).pargfid,
+                         *((NEW_LINK_CX(ctr_inode_cx))->pargfid));
+            if (snprintf(CTR_DB_REC(ctr_local).file_name, GF_NAME_MAX + 1, "%s",
+                         NEW_LINK_CX(ctr_inode_cx)->basename) > GF_NAME_MAX)
+                goto out;
+        }
+        /*rename fop*/
+        if (OLD_LINK_CX(ctr_inode_cx)) {
+            gf_uuid_copy(CTR_DB_REC(ctr_local).old_pargfid,
+                         *((OLD_LINK_CX(ctr_inode_cx))->pargfid));
+            if (snprintf(CTR_DB_REC(ctr_local).old_file_name, GF_NAME_MAX + 1,
+                         "%s", OLD_LINK_CX(ctr_inode_cx)->basename) >
+                GF_NAME_MAX)
+                goto out;
+        }
+    }
+    ret = 0;
+out:
+    /*On error roll back and clean the record*/
+    if (ret == -1) {
+        CLEAR_CTR_DB_RECORD(ctr_local);
+    }
+    return ret;
+}
+
+/******************************************************************************
+ *
+ * CTR xlator init related functions
+ *
+ *
+ * ****************************************************************************/
+static int
+extract_sql_params(xlator_t *this, dict_t *params_dict)
+{
+ int ret = -1;
+ char *db_path = NULL;
+ char *db_name = NULL;
+ char *db_full_path = NULL;
+ /* Puts the full db path and the remaining sql params into params_dict */
+ GF_ASSERT(this);
+ GF_ASSERT(params_dict);
+
+ /* Extract the path of the db; "/var/run/gluster/" is passed as default */
+ db_path = NULL;
+ GET_DB_PARAM_FROM_DICT_DEFAULT(this->name, this->options, "db-path",
+ db_path, "/var/run/gluster/");
+
+ /* Extract the name of the db; "gf_ctr_db.db" is passed as default */
+ db_name = NULL;
+ GET_DB_PARAM_FROM_DICT_DEFAULT(this->name, this->options, "db-name",
+ db_name, "gf_ctr_db.db");
+
+ /* Construct full path of the db as "<db-path>/<db-name>" */
+ ret = gf_asprintf(&db_full_path, "%s/%s", db_path, db_name);
+ if (ret < 0) {
+ gf_msg(GFDB_DATA_STORE, GF_LOG_ERROR, 0,
+ CTR_MSG_CONSTRUCT_DB_PATH_FAILED,
+ "Construction of full db path failed!");
+ goto out;
+ }
+
+ /* Setting the SQL DB Path (the macro is handed ret and the out label) */
+ SET_DB_PARAM_TO_DICT(this->name, params_dict, GFDB_SQL_PARAM_DBPATH,
+ db_full_path, ret, out);
+
+ /* Extract rest of the sql params. NOTE(review): a failure is only logged */
+ ret = gfdb_set_sql_params(this->name, this->options, params_dict);
+ if (ret) {
+ gf_msg(GFDB_DATA_STORE, GF_LOG_ERROR, 0,
+ CTR_MSG_SET_VALUE_TO_SQL_PARAM_FAILED,
+ "Failed setting values to sql param dict!");
+ }
+
+ ret = 0; /* also overwrites a gfdb_set_sql_params() error - TODO confirm */
+
+out:
+ if (ret) /* db_full_path is released on the failure path only */
+ GF_FREE(db_full_path);
+ return ret;
+}
+
+/*
+ * Extract the db parameters for db_type into params_dict.
+ * Only sqlite3 has an extractor; the other listed db types yield -1.
+ */
+int
+extract_db_params(xlator_t *this, dict_t *params_dict, gfdb_db_type_t db_type)
+{
+    GF_ASSERT(this);
+    GF_ASSERT(params_dict);
+
+    switch (db_type) {
+        case GFDB_SQLITE3:
+            /* zero on success, the extractor's error code otherwise */
+            return extract_sql_params(this, params_dict);
+        case GFDB_ROCKS_DB:
+        case GFDB_HYPERDEX:
+        case GFDB_HASH_FILE_STORE:
+        case GFDB_INVALID_DB:
+        case GFDB_DB_END:
+            return -1;
+    }
+
+    /* a value outside the listed enumerators is reported as success */
+    return 0;
+}
+
+int
+extract_ctr_options(xlator_t *this, gf_ctr_private_t *_priv)
+{
+ int ret = -1;
+ char *_val_str = NULL;
+ /* Reads the xlator options into _priv; 0 on success, -1 via the out label */
+ GF_ASSERT(this);
+ GF_ASSERT(_priv);
+
+ /* Check if the CTR Translator is enabled. By default it is disabled. */
+ _priv->enabled = _gf_false;
+ GF_OPTION_INIT("ctr-enabled", _priv->enabled, bool, out);
+ if (!_priv->enabled) {
+ gf_msg(GFDB_DATA_STORE, GF_LOG_INFO, 0, CTR_MSG_XLATOR_DISABLED,
+ "CTR Xlator is disabled.");
+ ret = 0; /* not an error: the remaining options are simply not read */
+ goto out;
+ }
+
+ /*Extract db type*/
+ GF_OPTION_INIT("db-type", _val_str, str, out);
+ _priv->gfdb_db_type = gf_string2gfdbdbtype(_val_str);
+
+ /*Extract flag for record on wind*/
+ GF_OPTION_INIT("record-entry", _priv->ctr_record_wind, bool, out);
+
+ /*Extract flag for record on unwind*/
+ GF_OPTION_INIT("record-exit", _priv->ctr_record_unwind, bool, out);
+
+ /*Extract flag for record on counters*/
+ GF_OPTION_INIT("record-counters", _priv->ctr_record_counter, bool, out);
+
+ /* Extract flag for record metadata heat */
+ GF_OPTION_INIT("ctr-record-metadata-heat", _priv->ctr_record_metadata_heat,
+ bool, out);
+
+ /*Extract flag for link consistency*/
+ GF_OPTION_INIT("ctr_link_consistency", _priv->ctr_link_consistency, bool,
+ out);
+
+ /*Extract ctr_lookupheal_inode_timeout */
+ GF_OPTION_INIT("ctr_lookupheal_inode_timeout",
+ _priv->ctr_lookupheal_inode_timeout, uint64, out);
+
+ /*Extract ctr_lookupheal_link_timeout*/
+ GF_OPTION_INIT("ctr_lookupheal_link_timeout",
+ _priv->ctr_lookupheal_link_timeout, uint64, out);
+
+ /*Extract flag for hot tier brick*/
+ GF_OPTION_INIT("hot-brick", _priv->ctr_hot_brick, bool, out);
+
+ /* Extract the sync mode; _val_str is reused for the second string option */
+ GF_OPTION_INIT("db-sync", _val_str, str, out);
+ _priv->gfdb_sync_type = gf_string2gfdbdbsync(_val_str);
+
+ ret = 0;
+
+out:
+ return ret;
+}
diff --git a/xlators/features/changetimerecorder/src/ctr-helper.h b/xlators/features/changetimerecorder/src/ctr-helper.h
new file mode 100644
index 0000000..517fbb0
--- /dev/null
+++ b/xlators/features/changetimerecorder/src/ctr-helper.h
@@ -0,0 +1,854 @@
+/*
+ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __CTR_HELPER_H
+#define __CTR_HELPER_H
+
+#include <glusterfs/xlator.h>
+#include "ctr_mem_types.h"
+#include <glusterfs/iatt.h>
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/common-utils.h>
+#include <time.h>
+#include <sys/time.h>
+#include <pthread.h>
+
+#include "gfdb_data_store.h"
+#include "ctr-xlator-ctx.h"
+#include "ctr-messages.h"
+
+#define CTR_DEFAULT_HARDLINK_EXP_PERIOD 300 /* Five mins */
+#define CTR_DEFAULT_INODE_EXP_PERIOD 300 /* Five mins */
+
+typedef struct ctr_query_cbk_args {
+ int query_fd;
+ int count;
+} ctr_query_cbk_args_t;
+
+/*CTR Xlator Private structure*/
+typedef struct gf_ctr_private {
+ gf_boolean_t enabled;
+ char *ctr_db_path;
+ gf_boolean_t ctr_hot_brick;
+ gf_boolean_t ctr_record_wind;
+ gf_boolean_t ctr_record_unwind;
+ gf_boolean_t ctr_record_counter;
+ gf_boolean_t ctr_record_metadata_heat;
+ gf_boolean_t ctr_link_consistency;
+ gfdb_db_type_t gfdb_db_type;
+ gfdb_sync_type_t gfdb_sync_type;
+ gfdb_conn_node_t *_db_conn;
+ uint64_t ctr_lookupheal_link_timeout;
+ uint64_t ctr_lookupheal_inode_timeout;
+ gf_boolean_t compact_active;
+ gf_boolean_t compact_mode_switched;
+ pthread_mutex_t compact_lock;
+} gf_ctr_private_t;
+
+/*
+ * gf_ctr_local_t is the ctr xlator local data structure that is stored in
+ * the call_frame of each FOP.
+ *
+ * gfdb_db_record: The gf_ctr_local contains a gfdb_db_record object, which is
+ * used by the insert_record() api from the libgfdb. The gfdb_db_record object
+ * will contain all the inode and hardlink info (only for dentry fops: create,
+ * mknod, link, unlink, rename). The ctr_local is kept alive till the unwind
+ * call and will be released during the unwind. The same gfdb_db_record will be
+ * used for the unwind insert_record() api, to record unwind in the database.
+ *
+ * ia_inode_type in gf_ctr_local will tell the type of the inode. This is
+ * important during the unwind path, as we will not have the inode during
+ * the unwind path. We would have included this in the gfdb_db_record itself
+ * but currently we record only file inode information.
+ *
+ * is_internal_fop in gf_ctr_local will tell us if this is an internal fop and
+ * take special/no action. We don't record change/access times or increment
+ * the heat counter for internal fops from the rebalancer.
+ * */
+typedef struct gf_ctr_local {
+ gfdb_db_record_t gfdb_db_record;
+ ia_type_t ia_inode_type;
+ gf_boolean_t is_internal_fop;
+ gf_special_pid_t client_pid;
+} gf_ctr_local_t;
+/*
+ * Easy access of gfdb_db_record of ctr_local
+ * */
+#define CTR_DB_REC(ctr_local) (ctr_local->gfdb_db_record)
+
+/*Clear db record*/
+#define CLEAR_CTR_DB_RECORD(ctr_local) \
+ do { \
+ ctr_local->gfdb_db_record.gfdb_fop_path = GFDB_FOP_INVALID; \
+ memset(&(ctr_local->gfdb_db_record.gfdb_wind_change_time), 0, \
+ sizeof(gfdb_time_t)); \
+ memset(&(ctr_local->gfdb_db_record.gfdb_unwind_change_time), 0, \
+ sizeof(gfdb_time_t)); \
+ gf_uuid_clear(ctr_local->gfdb_db_record.gfid); \
+ gf_uuid_clear(ctr_local->gfdb_db_record.pargfid); \
+ memset(ctr_local->gfdb_db_record.file_name, 0, GF_NAME_MAX + 1); \
+ memset(ctr_local->gfdb_db_record.old_file_name, 0, GF_NAME_MAX + 1); \
+ ctr_local->gfdb_db_record.gfdb_fop_type = GFDB_FOP_INVALID_OP; \
+ ctr_local->ia_inode_type = IA_INVAL; \
+ } while (0)
+
+static gf_ctr_local_t *
+init_ctr_local_t(xlator_t *this)
+{
+ gf_ctr_local_t *ctr_local = NULL;
+
+ GF_ASSERT(this);
+
+ ctr_local = mem_get0(this->local_pool);
+ if (!ctr_local) {
+ gf_msg(GFDB_DATA_STORE, GF_LOG_ERROR, 0,
+ CTR_MSG_CREATE_CTR_LOCAL_ERROR_WIND,
+ "Error while creating ctr local");
+ goto out;
+ }
+
+ CLEAR_CTR_DB_RECORD(ctr_local);
+out:
+ return ctr_local;
+}
+
+static void
+free_ctr_local(gf_ctr_local_t *ctr_local)
+{
+ if (ctr_local)
+ mem_put(ctr_local);
+}
+
+/******************************************************************************
+ *
+ *
+ * Context Carrier Structures
+ *
+ *
+ * ****************************************************************************/
+
+/*
+ * Context Carrier structures are used to carry relevant information about
+ * inodes and links from the fops calls to the ctr_insert_wind.
+ * These structures just hold pointers to the original data and do not
+ * do a deep copy of any data. This info is deep copied to
+ * ctr_local->gfdb_db_record and passed to insert_record() api of libgfdb. This
+ * info remains persistent for the unwind in ctr_local->gfdb_db_record
+ * and once used will be destroyed.
+ *
+ * gf_ctr_link_context_t : Context structure for hard links
+ * gf_ctr_inode_context_t : Context structure for inodes
+ *
+ * */
+
+/*Context Carrier Structure for hard links*/
+typedef struct gf_ctr_link_context {
+ uuid_t *pargfid;
+ const char *basename;
+} gf_ctr_link_context_t;
+
+/*Context Carrier Structure for inodes*/
+typedef struct gf_ctr_inode_context {
+ ia_type_t ia_type;
+ uuid_t *gfid;
+ uuid_t *old_gfid;
+ gf_ctr_link_context_t *new_link_cx;
+ gf_ctr_link_context_t *old_link_cx;
+ gfdb_fop_type_t fop_type;
+ gfdb_fop_path_t fop_path;
+ gf_boolean_t is_internal_fop;
+ /* Indicating metadata fops */
+ gf_boolean_t is_metadata_fop;
+} gf_ctr_inode_context_t;
+
+/*******************Util Macros for Context Carrier Structures*****************/
+
+/*Checks if ctr_link_cx is sane!*/
+#define IS_CTR_LINK_CX_SANE(ctr_link_cx) \
+ do { \
+ if (ctr_link_cx) { \
+ if (ctr_link_cx->pargfid) \
+ GF_ASSERT(*(ctr_link_cx->pargfid)); \
+ GF_ASSERT(ctr_link_cx->basename); \
+ }; \
+ } while (0)
+
+/*Clear and fill the ctr_link_context with values*/
+#define FILL_CTR_LINK_CX(ctr_link_cx, _pargfid, _basename, label) \
+ do { \
+ GF_VALIDATE_OR_GOTO("ctr", ctr_link_cx, label); \
+ GF_VALIDATE_OR_GOTO("ctr", _pargfid, label); \
+ GF_VALIDATE_OR_GOTO("ctr", _basename, label); \
+ memset(ctr_link_cx, 0, sizeof(*ctr_link_cx)); \
+ ctr_link_cx->pargfid = &_pargfid; \
+ ctr_link_cx->basename = _basename; \
+ } while (0)
+
+#define NEW_LINK_CX(ctr_inode_cx) ctr_inode_cx->new_link_cx
+
+#define OLD_LINK_CX(ctr_inode_cx) ctr_inode_cx->old_link_cx
+
+/*Checks if ctr_inode_cx is sane!*/
+#define IS_CTR_INODE_CX_SANE(ctr_inode_cx) \
+ do { \
+ GF_ASSERT(ctr_inode_cx); \
+ GF_ASSERT(ctr_inode_cx->gfid); \
+ GF_ASSERT(*(ctr_inode_cx->gfid)); \
+ GF_ASSERT(ctr_inode_cx->fop_type != GFDB_FOP_INVALID_OP); \
+ GF_ASSERT(ctr_inode_cx->fop_path != GFDB_FOP_INVALID); \
+ IS_CTR_LINK_CX_SANE(NEW_LINK_CX(ctr_inode_cx)); \
+ IS_CTR_LINK_CX_SANE(OLD_LINK_CX(ctr_inode_cx)); \
+ } while (0)
+
+/*Clear and fill the ctr_inode_context; link contexts are checked once set*/
+#define FILL_CTR_INODE_CONTEXT(ctr_inode_cx, _ia_type, _gfid, _new_link_cx, \
+                               _old_link_cx, _fop_type, _fop_path) \
+    do { \
+        GF_ASSERT(ctr_inode_cx); \
+        GF_ASSERT(_gfid); \
+        GF_ASSERT(_fop_type != GFDB_FOP_INVALID_OP); \
+        GF_ASSERT(_fop_path != GFDB_FOP_INVALID); \
+        memset(ctr_inode_cx, 0, sizeof(*ctr_inode_cx)); \
+        ctr_inode_cx->ia_type = _ia_type; \
+        ctr_inode_cx->gfid = &_gfid; \
+        if (_new_link_cx) \
+            NEW_LINK_CX(ctr_inode_cx) = _new_link_cx; \
+        IS_CTR_LINK_CX_SANE(NEW_LINK_CX(ctr_inode_cx)); \
+        if (_old_link_cx) \
+            OLD_LINK_CX(ctr_inode_cx) = _old_link_cx; \
+        IS_CTR_LINK_CX_SANE(OLD_LINK_CX(ctr_inode_cx)); \
+        ctr_inode_cx->fop_type = _fop_type; \
+        ctr_inode_cx->fop_path = _fop_path; \
+    } while (0)
+
+/******************************************************************************
+ *
+ * Util functions or macros used by
+ * insert wind and insert unwind
+ *
+ * ****************************************************************************/
+/* Free ctr frame local */
+static inline void
+ctr_free_frame_local(call_frame_t *frame)
+{
+ if (frame) {
+ free_ctr_local((gf_ctr_local_t *)frame->local);
+ frame->local = NULL;
+ }
+}
+
+/* Setting GF_REQUEST_LINK_COUNT_XDATA in dict
+ * that has to be sent to POSIX Xlator to send
+ * link count in unwind path.
+ * return 0 for success without creation of dict
+ * return 1 for success with creation of dict (the caller owns the new dict)
+ * return -1 for failure (a dict created here is released, *xdata is reset)
+ * */
+static inline int
+set_posix_link_request(xlator_t *this, dict_t **xdata)
+{
+    int ret = -1;
+    gf_boolean_t is_created = _gf_false;
+
+    GF_VALIDATE_OR_GOTO("ctr", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, xdata, out);
+
+    /*create xdata if NULL*/
+    if (!*xdata) {
+        *xdata = dict_new();
+        is_created = _gf_true;
+    }
+
+    if (!*xdata) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_XDATA_NULL,
+               "xdata is NULL :Cannot send "
+               "GF_REQUEST_LINK_COUNT_XDATA to posix");
+        goto out;
+    }
+
+    if (dict_set_int32(*xdata, GF_REQUEST_LINK_COUNT_XDATA, 1)) {
+        gf_msg(this->name, GF_LOG_ERROR, 0,
+               CTR_MSG_SET_CTR_RESPONSE_LINK_COUNT_XDATA_FAILED,
+               "Failed setting GF_REQUEST_LINK_COUNT_XDATA");
+        goto out;
+    }
+
+    /* The dict_set result must not overwrite the creation status, or the
+     * caller can never learn that it has to unref the new dict. */
+    ret = is_created ? 1 : 0;
+
+out:
+    /* is_created is tested first because xdata itself may be NULL here.
+     * *xdata is reset so that the caller cannot reuse the released dict;
+     * a dict handed in by the caller is never touched. */
+    if (ret == -1 && is_created && *xdata) {
+        dict_unref(*xdata);
+        *xdata = NULL;
+    }
+    return ret;
+}
+
+/*
+ * If a bitrot fop
+ * */
+#define BITROT_FOP(frame) \
+ (frame->root->pid == GF_CLIENT_PID_BITD || \
+ frame->root->pid == GF_CLIENT_PID_SCRUB)
+
+/*
+ * If a rebalancer fop
+ * */
+#define REBALANCE_FOP(frame) (frame->root->pid == GF_CLIENT_PID_DEFRAG)
+
+/*
+ * If its a tiering rebalancer fop
+ * */
+#define TIER_REBALANCE_FOP(frame) \
+ (frame->root->pid == GF_CLIENT_PID_TIER_DEFRAG)
+
+/*
+ * If its a AFR SELF HEAL
+ * */
+#define AFR_SELF_HEAL_FOP(frame) (frame->root->pid == GF_CLIENT_PID_SELF_HEALD)
+
+/*
+ * if a rebalancer fop goto
+ * */
+#define CTR_IF_REBALANCE_FOP_THEN_GOTO(frame, label) \
+ do { \
+ if (REBALANCE_FOP(frame)) \
+ goto label; \
+ } while (0)
+
+/*
+ * Internal fop: self-heal, bitrot or (tier) rebalance pid, or an xdata that
+ * carries GLUSTERFS_INTERNAL_FOP_KEY. Returns _gf_true for such a frame.
+ * */
+static inline gf_boolean_t
+is_internal_fop(call_frame_t *frame, dict_t *xdata)
+{
+ gf_boolean_t ret = _gf_false;
+
+ GF_ASSERT(frame);
+ GF_ASSERT(frame->root);
+
+ if (AFR_SELF_HEAL_FOP(frame)) {
+ ret = _gf_true;
+ }
+ if (BITROT_FOP(frame)) {
+ ret = _gf_true;
+ }
+ if (REBALANCE_FOP(frame) || TIER_REBALANCE_FOP(frame)) {
+ ret = _gf_true;
+ if (xdata && dict_get(xdata, CTR_ATTACH_TIER_LOOKUP)) {
+ ret = _gf_false; /* this key exempts a rebalance frame */
+ }
+ }
+ if (xdata && dict_get(xdata, GLUSTERFS_INTERNAL_FOP_KEY)) {
+ ret = _gf_true; /* checked last, so it overrides the exemption above */
+ }
+
+ return ret;
+}
+
+#define CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, dict, label) \
+ do { \
+ if (is_internal_fop(frame, dict)) \
+ goto label; \
+ } while (0)
+
+/* if fop has failed exit */
+#define CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, label) \
+ do { \
+ if (op_ret == -1) { \
+ gf_msg_trace(this->name, 0, "Failed fop with %s", \
+ strerror(op_errno)); \
+ goto label; \
+ }; \
+ } while (0)
+
+/*
+ * If the CTR Xlator is disabled (no db connection) then goto label
+ * */
+#define CTR_IS_DISABLED_THEN_GOTO(this, label) \
+ do { \
+ gf_ctr_private_t *_priv = NULL; \
+ GF_ASSERT(this); \
+ GF_ASSERT(this->private); \
+ _priv = this->private; \
+ if (!_priv->_db_conn) \
+ goto label; \
+ } while (0)
+
+/*
+ * If CTR record metadata heat is disabled then goto label
+ * */
+#define CTR_RECORD_METADATA_HEAT_IS_DISABLED_THEN_GOTO(this, label) \
+ do { \
+ gf_ctr_private_t *_priv = NULL; \
+ GF_ASSERT(this); \
+ GF_ASSERT(this->private); \
+ _priv = this->private; \
+ if (!_priv->ctr_record_metadata_heat) \
+ goto label; \
+ } while (0)
+
+int
+fill_db_record_for_unwind(xlator_t *this, gf_ctr_local_t *ctr_local,
+ gfdb_fop_type_t fop_type, gfdb_fop_path_t fop_path);
+
+int
+fill_db_record_for_wind(xlator_t *this, gf_ctr_local_t *ctr_local,
+ gf_ctr_inode_context_t *ctr_inode_cx);
+
+/*******************************************************************************
+ * CTR INSERT WIND
+ * *****************************************************************************
+ * Function used to insert/update record into the database during a wind fop
+ * This function creates ctr_local structure into the frame of the fop
+ * call.
+ * ****************************************************************************/
+
+static inline int
+ctr_insert_wind(call_frame_t *frame, xlator_t *this,
+ gf_ctr_inode_context_t *ctr_inode_cx)
+{
+ int ret = -1;
+ gf_ctr_private_t *_priv = NULL;
+ gf_ctr_local_t *ctr_local = NULL;
+
+ GF_ASSERT(frame);
+ GF_ASSERT(frame->root);
+ GF_ASSERT(this);
+ IS_CTR_INODE_CX_SANE(ctr_inode_cx);
+
+ _priv = this->private;
+ GF_ASSERT(_priv);
+
+ GF_ASSERT(_priv->_db_conn);
+
+ /*If record_wind option of CTR is on record wind for
+ * regular files only*/
+ if (_priv->ctr_record_wind && ctr_inode_cx->ia_type != IA_IFDIR) {
+ frame->local = init_ctr_local_t(this);
+ if (!frame->local) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ CTR_MSG_CREATE_CTR_LOCAL_ERROR_WIND,
+ "WIND: Error while creating ctr local");
+ goto out;
+ };
+ ctr_local = frame->local;
+ ctr_local->client_pid = frame->root->pid;
+ ctr_local->is_internal_fop = ctr_inode_cx->is_internal_fop;
+
+ /* Decide whether to record counters or not */
+ CTR_DB_REC(ctr_local).do_record_counters = _gf_false;
+ /* If record counter is enabled */
+ if (_priv->ctr_record_counter) {
+ /* If not a internal fop */
+ if (!(ctr_local->is_internal_fop)) {
+ /* If its a metadata fop AND
+ * record metadata heat
+ * OR
+ * its NOT a metadata fop */
+ if ((ctr_inode_cx->is_metadata_fop &&
+ _priv->ctr_record_metadata_heat) ||
+ (!ctr_inode_cx->is_metadata_fop)) {
+ CTR_DB_REC(ctr_local).do_record_counters = _gf_true;
+ }
+ }
+ }
+
+ /* Decide whether to record times or not
+ * For non internal FOPS record times as usual*/
+ CTR_DB_REC(ctr_local).do_record_times = _gf_false;
+ if (!ctr_local->is_internal_fop) {
+ /* If its a metadata fop AND
+ * record metadata heat
+ * OR
+ * its NOT a metadata fop */
+ if ((ctr_inode_cx->is_metadata_fop &&
+ _priv->ctr_record_metadata_heat) ||
+ (!ctr_inode_cx->is_metadata_fop)) {
+ CTR_DB_REC(ctr_local).do_record_times =
+ (_priv->ctr_record_wind || _priv->ctr_record_unwind);
+ }
+ }
+ /* when its a internal FOPS*/
+ else {
+ /* Record times only for create
+ * i.e when the inode is created */
+ CTR_DB_REC(ctr_local).do_record_times = (isdentrycreatefop(
+ ctr_inode_cx->fop_type))
+ ? _gf_true
+ : _gf_false;
+ }
+
+ /*Fill the db record for insertion*/
+ ret = fill_db_record_for_wind(this, ctr_local, ctr_inode_cx);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ CTR_MSG_FILL_CTR_LOCAL_ERROR_WIND,
+ "WIND: Error filling ctr local");
+ goto out;
+ }
+
+ /*Insert the db record*/
+ ret = insert_record(_priv->_db_conn, &ctr_local->gfdb_db_record);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ CTR_MSG_INSERT_RECORD_WIND_FAILED,
+ "WIND: Inserting of record failed!");
+ goto out;
+ }
+ }
+ ret = 0;
+out:
+
+ if (ret) {
+ free_ctr_local(ctr_local);
+ frame->local = NULL;
+ }
+
+ return ret;
+}
+
+/*******************************************************************************
+ * CTR INSERT UNWIND
+ * *****************************************************************************
+ * Function used to insert/update record into the database during a unwind fop.
+ * It works on the ctr_local found in the frame (nothing is recorded without
+ * one). Returns 0 on success, -1 when filling or inserting the record fails.
+ * ****************************************************************************/
+static inline int
+ctr_insert_unwind(call_frame_t *frame, xlator_t *this, gfdb_fop_type_t fop_type,
+                  gfdb_fop_path_t fop_path)
+{
+    int ret = -1;
+    gf_ctr_private_t *_priv = NULL;
+    gf_ctr_local_t *ctr_local = NULL;
+
+    GF_ASSERT(frame);
+    GF_ASSERT(this);
+
+    _priv = this->private;
+    GF_ASSERT(_priv);
+    GF_ASSERT(_priv->_db_conn);
+
+    ctr_local = frame->local;
+
+    if (ctr_local && (_priv->ctr_record_unwind || isdentryfop(fop_type)) &&
+        (ctr_local->ia_inode_type != IA_IFDIR)) {
+        CTR_DB_REC(ctr_local).do_record_uwind_time = _priv->ctr_record_unwind;
+
+        ret = fill_db_record_for_unwind(this, ctr_local, fop_type, fop_path);
+        if (ret == -1) {
+            gf_msg(this->name, GF_LOG_ERROR, 0,
+                   CTR_MSG_FILL_CTR_LOCAL_ERROR_UNWIND,
+                   "UNWIND: Error filling ctr local");
+            goto out;
+        }
+
+        ret = insert_record(_priv->_db_conn, &ctr_local->gfdb_db_record);
+        if (ret == -1) {
+            /* Message id kept as is; the text now names the failing step. */
+            gf_msg(this->name, GF_LOG_ERROR, 0,
+                   CTR_MSG_FILL_CTR_LOCAL_ERROR_UNWIND,
+                   "UNWIND: Inserting of record failed!");
+            goto out;
+        }
+    }
+    ret = 0;
+out:
+    return ret;
+}
+
+/******************************************************************************
+ * Delete file/hardlink record/s from db; returns 0 on success, else non-zero
+ * ****************************************************************************/
+static inline int
+ctr_delete_hard_link_from_db(xlator_t *this, uuid_t gfid, uuid_t pargfid,
+                             char *basename, gfdb_fop_type_t fop_type,
+                             gfdb_fop_path_t fop_path)
+{
+    int ret = -1;
+    gfdb_db_record_t gfdb_db_record;
+    gf_ctr_private_t *_priv = NULL;
+
+    _priv = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, _priv, out);
+    GF_VALIDATE_OR_GOTO(this->name, (!gf_uuid_is_null(gfid)), out);
+    GF_VALIDATE_OR_GOTO(this->name, (!gf_uuid_is_null(pargfid)), out);
+    GF_VALIDATE_OR_GOTO(this->name, basename, out);
+    GF_VALIDATE_OR_GOTO(this->name, (fop_type == GFDB_FOP_DENTRY_WRITE), out);
+    GF_VALIDATE_OR_GOTO(this->name, (fop_path == GFDB_FOP_UNDEL ||
+                                     fop_path == GFDB_FOP_UNDEL_ALL), out);
+
+    /* Set gfdb_db_record to 0 */
+    memset(&gfdb_db_record, 0, sizeof(gfdb_db_record));
+
+    /* Copy basename: the buffer holds GF_NAME_MAX chars plus the NUL */
+    if (snprintf(gfdb_db_record.file_name, GF_NAME_MAX + 1, "%s", basename) >
+        GF_NAME_MAX)
+        goto out;
+
+    /* Copy gfid into db record */
+    gf_uuid_copy(gfdb_db_record.gfid, gfid);
+
+    /* Copy pargfid into db record */
+    gf_uuid_copy(gfdb_db_record.pargfid, pargfid);
+
+    gfdb_db_record.gfdb_fop_path = fop_path;
+    gfdb_db_record.gfdb_fop_type = fop_type;
+
+    /*send delete request to db*/
+    ret = insert_record(_priv->_db_conn, &gfdb_db_record);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_RECORD_WIND_FAILED,
+               "Failed to delete record. %s", basename);
+        goto out;
+    }
+    ret = 0;
+out:
+    return ret;
+}
+
+/******************************* Hard link function ***************************/
+
+static inline gf_boolean_t
+__is_inode_expired(ctr_xlator_ctx_t *ctr_xlator_ctx, gf_ctr_private_t *_priv,
+ gfdb_time_t *current_time)
+{
+ gf_boolean_t ret = _gf_false;
+ uint64_t time_diff = 0;
+
+ GF_ASSERT(ctr_xlator_ctx);
+ GF_ASSERT(_priv);
+ GF_ASSERT(current_time);
+
+ time_diff = current_time->tv_sec - ctr_xlator_ctx->inode_heal_period;
+
+ ret = (time_diff >= _priv->ctr_lookupheal_inode_timeout) ? _gf_true
+ : _gf_false;
+ return ret;
+}
+
+/* True once ctr_lookupheal_link_timeout has passed since the heal period */
+static inline gf_boolean_t
+__is_hardlink_expired(ctr_hard_link_t *ctr_hard_link, gf_ctr_private_t *_priv,
+                      gfdb_time_t *current_time)
+{
+    gf_boolean_t ret = _gf_false;
+    uint64_t time_diff = 0;
+
+    GF_ASSERT(ctr_hard_link);
+    GF_ASSERT(_priv);
+    GF_ASSERT(current_time);
+    time_diff = current_time->tv_sec - ctr_hard_link->hardlink_heal_period;
+    /* Plain comparison, as in __is_inode_expired(): "ret || cond ? a : b"
+     * parses as "(ret || cond) ? a : b" and only worked while ret was false */
+    ret = (time_diff >= _priv->ctr_lookupheal_link_timeout) ? _gf_true
+                                                            : _gf_false;
+    return ret;
+}
+
+/* Return values of heal; add_hard_link_ctx() ORs the two TRY_*_HEAL values */
+typedef enum ctr_heal_ret_val {
+ CTR_CTX_ERROR = -1, /* getting the time or adding the hard link failed */
+ /* No healing required */
+ CTR_TRY_NO_HEAL = 0,
+ /* Try healing hard link */
+ CTR_TRY_HARDLINK_HEAL = 1,
+ /* Try healing inode */
+ CTR_TRY_INODE_HEAL = 2,
+} ctr_heal_ret_val_t;
+
+/**
+ * @brief Function to add a hard link to the inode context variable.
+ * The inode context maintains an in-memory list of hard links. This list
+ * is used for smart healing of the database.
+ * @param frame of the FOP
+ * @param this is the xlator instance
+ * @param inode whose ctr inode context is used
+ * @return ctr_heal_ret_val_t: the heal(s) to try, or CTR_CTX_ERROR
+ */
+
+static inline ctr_heal_ret_val_t
+add_hard_link_ctx(call_frame_t *frame, xlator_t *this, inode_t *inode)
+{
+ ctr_heal_ret_val_t ret_val = CTR_TRY_NO_HEAL;
+ int ret = -1;
+ gf_ctr_local_t *ctr_local = NULL;
+ ctr_xlator_ctx_t *ctr_xlator_ctx = NULL;
+ ctr_hard_link_t *ctr_hard_link = NULL;
+ gf_ctr_private_t *_priv = NULL;
+ gfdb_time_t current_time = {0};
+
+ GF_ASSERT(frame);
+ GF_ASSERT(this);
+ GF_ASSERT(inode);
+ GF_ASSERT(this->private);
+
+ _priv = this->private;
+
+ ctr_local = frame->local;
+ if (!ctr_local) {
+ goto out; /* ret_val is still CTR_TRY_NO_HEAL */
+ }
+
+ ctr_xlator_ctx = init_ctr_xlator_ctx(this, inode);
+ if (!ctr_xlator_ctx) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ CTR_MSG_ACCESS_CTR_INODE_CONTEXT_FAILED,
+ "Failed accessing ctr inode context");
+ goto out; /* only logged: CTR_TRY_NO_HEAL is returned */
+ }
+
+ LOCK(&ctr_xlator_ctx->lock);
+
+ /* Check if the hard link already exists
+ * in the ctr inode context*/
+ ctr_hard_link = ctr_search_hard_link_ctx(this, ctr_xlator_ctx,
+ CTR_DB_REC(ctr_local).pargfid,
+ CTR_DB_REC(ctr_local).file_name);
+ /* if it is there, do not add it; only work out which heals are due */
+ if (ctr_hard_link) {
+ ret = gettimeofday(&current_time, NULL);
+ if (ret == -1) {
+ gf_log(this->name, GF_LOG_ERROR, "Failed to get current time");
+ ret_val = CTR_CTX_ERROR;
+ goto unlock;
+ }
+
+ if (__is_hardlink_expired(ctr_hard_link, _priv, &current_time)) {
+ ctr_hard_link->hardlink_heal_period = current_time.tv_sec;
+ ret_val = ret_val | CTR_TRY_HARDLINK_HEAL;
+ }
+
+ if (__is_inode_expired(ctr_xlator_ctx, _priv, &current_time)) {
+ ctr_xlator_ctx->inode_heal_period = current_time.tv_sec;
+ ret_val = ret_val | CTR_TRY_INODE_HEAL;
+ }
+
+ goto unlock;
+ }
+
+ /* Add the hard link to the list (the context lock is still held) */
+ ret = ctr_add_hard_link(this, ctr_xlator_ctx, CTR_DB_REC(ctr_local).pargfid,
+ CTR_DB_REC(ctr_local).file_name);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ CTR_MSG_ADD_HARDLINK_TO_CTR_INODE_CONTEXT_FAILED,
+ "Failed to add hardlink to the ctr inode context");
+ ret_val = CTR_CTX_ERROR;
+ goto unlock;
+ }
+
+ ret_val = CTR_TRY_NO_HEAL;
+unlock:
+ UNLOCK(&ctr_xlator_ctx->lock);
+out:
+ return ret_val;
+}
+
+/*
+ * Remove the hard link named by frame->local (parent gfid and file name)
+ * from the ctr context of the inode.
+ *
+ * Returns 0 when the link was removed or the inode has no ctr context,
+ * -1 when frame->local is not set, otherwise the non-zero result of
+ * ctr_delete_hard_link().
+ */
+static inline int
+delete_hard_link_ctx(call_frame_t *frame, xlator_t *this, inode_t *inode)
+{
+    int ret = -1;
+    gf_ctr_local_t *ctr_local = NULL;
+    ctr_xlator_ctx_t *ctx = NULL;
+
+    GF_ASSERT(frame);
+    GF_ASSERT(this);
+    GF_ASSERT(inode);
+
+    ctr_local = frame->local;
+    if (!ctr_local)
+        return ret;
+
+    /* Without a ctr inode context there is no list to remove from. */
+    ctx = get_ctr_xlator_ctx(this, inode);
+    if (!ctx)
+        return 0;
+
+    ret = ctr_delete_hard_link(this, ctx, CTR_DB_REC(ctr_local).pargfid,
+                               CTR_DB_REC(ctr_local).file_name);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_DELETE_HARDLINK_FAILED,
+               "Failed to delete hard link");
+    }
+
+    return ret;
+}
+
+/* Replace the hard link (old parent gfid + old file name of frame->local)
+ * in the inode's ctr context with the new parent gfid + file name.
+ * Returns 0 on success and non-zero on failure. */
+static inline int
+update_hard_link_ctx(call_frame_t *frame, xlator_t *this, inode_t *inode)
+{
+    int ret = -1;
+    ctr_xlator_ctx_t *ctr_xlator_ctx = NULL;
+    gf_ctr_local_t *ctr_local = NULL;
+
+    GF_ASSERT(frame);
+    GF_ASSERT(this);
+    GF_ASSERT(inode);
+
+    ctr_local = frame->local;
+    if (!ctr_local) {
+        goto out;
+    }
+
+    ctr_xlator_ctx = init_ctr_xlator_ctx(this, inode);
+    if (!ctr_xlator_ctx) {
+        gf_msg(this->name, GF_LOG_ERROR, 0,
+               CTR_MSG_ACCESS_CTR_INODE_CONTEXT_FAILED,
+               "Failed accessing ctr inode context");
+        goto out;
+    }
+
+    ret = ctr_update_hard_link(
+        this, ctr_xlator_ctx, CTR_DB_REC(ctr_local).pargfid,
+        CTR_DB_REC(ctr_local).file_name, CTR_DB_REC(ctr_local).old_pargfid,
+        CTR_DB_REC(ctr_local).old_file_name);
+    if (ret) {
+        /* Log this as an update failure (it was reported as a delete). */
+        gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_UPDATE_HARDLINK_FAILED,
+               "Failed to update hard link");
+    }
+out:
+    return ret;
+}
+
+/******************************************************************************
+ *
+ * CTR xlator init related functions
+ *
+ *
+ * ****************************************************************************/
+int
+extract_db_params(xlator_t *this, dict_t *params_dict, gfdb_db_type_t db_type);
+
+int
+extract_ctr_options(xlator_t *this, gf_ctr_private_t *_priv);
+
+#endif
diff --git a/xlators/features/changetimerecorder/src/ctr-messages.h b/xlators/features/changetimerecorder/src/ctr-messages.h
new file mode 100644
index 0000000..23adf0a
--- /dev/null
+++ b/xlators/features/changetimerecorder/src/ctr-messages.h
@@ -0,0 +1,61 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+ */
+
+#ifndef _CTR_MESSAGES_H_
+#define _CTR_MESSAGES_H_
+
+#include <glusterfs/glfs-message-id.h>
+
+/* To add new message IDs, append new identifiers at the end of the list.
+ *
+ * Never remove a message ID. If it's not used anymore, you can rename it or
+ * leave it as it is, but do not delete it. This is to prevent the reuse of
+ * its ID by other messages.
+ *
+ * The component name must match one of the entries defined in
+ * glfs-message-id.h.
+ */
+
+GLFS_MSGID(
+ CTR, CTR_MSG_CREATE_CTR_LOCAL_ERROR_WIND,
+ CTR_MSG_FILL_CTR_LOCAL_ERROR_UNWIND, CTR_MSG_FILL_CTR_LOCAL_ERROR_WIND,
+ CTR_MSG_INSERT_LINK_WIND_FAILED, CTR_MSG_INSERT_WRITEV_WIND_FAILED,
+ CTR_MSG_INSERT_WRITEV_UNWIND_FAILED, CTR_MSG_INSERT_SETATTR_WIND_FAILED,
+ CTR_MSG_INSERT_SETATTR_UNWIND_FAILED,
+ CTR_MSG_INSERT_FREMOVEXATTR_UNWIND_FAILED,
+ CTR_MSG_INSERT_FREMOVEXATTR_WIND_FAILED,
+ CTR_MSG_INSERT_REMOVEXATTR_WIND_FAILED,
+ CTR_MSG_INSERT_REMOVEXATTR_UNWIND_FAILED,
+ CTR_MSG_INSERT_TRUNCATE_WIND_FAILED, CTR_MSG_INSERT_TRUNCATE_UNWIND_FAILED,
+ CTR_MSG_INSERT_FTRUNCATE_UNWIND_FAILED,
+ CTR_MSG_INSERT_FTRUNCATE_WIND_FAILED, CTR_MSG_INSERT_RENAME_WIND_FAILED,
+ CTR_MSG_INSERT_RENAME_UNWIND_FAILED,
+ CTR_MSG_ACCESS_CTR_INODE_CONTEXT_FAILED, CTR_MSG_ADD_HARDLINK_FAILED,
+ CTR_MSG_DELETE_HARDLINK_FAILED, CTR_MSG_UPDATE_HARDLINK_FAILED,
+ CTR_MSG_GET_CTR_RESPONSE_LINK_COUNT_XDATA_FAILED,
+ CTR_MSG_SET_CTR_RESPONSE_LINK_COUNT_XDATA_FAILED,
+ CTR_MSG_INSERT_UNLINK_UNWIND_FAILED, CTR_MSG_INSERT_UNLINK_WIND_FAILED,
+ CTR_MSG_XDATA_NULL, CTR_MSG_INSERT_FSYNC_WIND_FAILED,
+ CTR_MSG_INSERT_FSYNC_UNWIND_FAILED, CTR_MSG_INSERT_MKNOD_UNWIND_FAILED,
+ CTR_MSG_INSERT_MKNOD_WIND_FAILED, CTR_MSG_INSERT_CREATE_WIND_FAILED,
+ CTR_MSG_INSERT_CREATE_UNWIND_FAILED, CTR_MSG_INSERT_RECORD_WIND_FAILED,
+ CTR_MSG_INSERT_READV_WIND_FAILED, CTR_MSG_GET_GFID_FROM_DICT_FAILED,
+ CTR_MSG_SET, CTR_MSG_FATAL_ERROR, CTR_MSG_DANGLING_VOLUME,
+ CTR_MSG_CALLOC_FAILED, CTR_MSG_EXTRACT_CTR_XLATOR_OPTIONS_FAILED,
+ CTR_MSG_INIT_DB_PARAMS_FAILED, CTR_MSG_CREATE_LOCAL_MEMORY_POOL_FAILED,
+ CTR_MSG_MEM_ACC_INIT_FAILED, CTR_MSG_CLOSE_DB_CONN_FAILED,
+ CTR_MSG_FILL_UNWIND_TIME_REC_ERROR, CTR_MSG_WRONG_FOP_PATH,
+ CTR_MSG_CONSTRUCT_DB_PATH_FAILED, CTR_MSG_SET_VALUE_TO_SQL_PARAM_FAILED,
+ CTR_MSG_XLATOR_DISABLED, CTR_MSG_HARDLINK_MISSING_IN_LIST,
+ CTR_MSG_ADD_HARDLINK_TO_LIST_FAILED, CTR_MSG_INIT_LOCK_FAILED,
+ CTR_MSG_COPY_FAILED, CTR_MSG_EXTRACT_DB_PARAM_OPTIONS_FAILED,
+ CTR_MSG_ADD_HARDLINK_TO_CTR_INODE_CONTEXT_FAILED, CTR_MSG_NULL_LOCAL);
+
+#endif /* !_CTR_MESSAGES_H_ */
diff --git a/xlators/features/changetimerecorder/src/ctr-xlator-ctx.c b/xlators/features/changetimerecorder/src/ctr-xlator-ctx.c
new file mode 100644
index 0000000..b6b66d5
--- /dev/null
+++ b/xlators/features/changetimerecorder/src/ctr-xlator-ctx.c
@@ -0,0 +1,362 @@
+/*
+ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "ctr-xlator-ctx.h"
+#include "ctr-messages.h"
+#include <time.h>
+#include <sys/time.h>
+
+#define IS_THE_ONLY_HARDLINK(ctr_hard_link)                                    \
+    ((ctr_hard_link)->list.next == (ctr_hard_link)->list.prev) /* sole entry */
+
+/* Free *ctr_hard_link and its base_name, then NULL the caller's pointer. */
+static void
+fini_ctr_hard_link(ctr_hard_link_t **ctr_hard_link)
+{
+    GF_ASSERT(ctr_hard_link);
+    if (!*ctr_hard_link) /* only a NULL link has nothing to free */
+        return;
+    GF_FREE((*ctr_hard_link)->base_name);
+    GF_FREE(*ctr_hard_link);
+    *ctr_hard_link = NULL;
+}
+
+/* Please lock the ctr_xlator_ctx before using this function */
+ctr_hard_link_t *
+ctr_search_hard_link_ctx(xlator_t *this, ctr_xlator_ctx_t *ctr_xlator_ctx,
+                         uuid_t pgfid, const char *base_name)
+{
+    ctr_hard_link_t *entry = NULL;
+
+    GF_ASSERT(this);
+    GF_ASSERT(ctr_xlator_ctx);
+
+    if (!pgfid || !base_name)
+        return NULL;
+
+    /* Linear scan: an entry matches when both its parent gfid and its
+     * base name equal the requested ones. */
+    list_for_each_entry(entry, &ctr_xlator_ctx->hardlink_list, list)
+    {
+        if (gf_uuid_compare(entry->pgfid, pgfid) != 0)
+            continue;
+        if (!entry->base_name)
+            continue;
+        if (strcmp(entry->base_name, base_name) == 0)
+            return entry;
+    }
+
+    /* No cached hard link matches. */
+    return NULL;
+}
+
+/* Please lock the ctr_xlator_ctx before using this function.
+ * Appends a new hard link (parent gfid + copy of base_name) to the list;
+ * returns 0 on success and a negative value on failure. */
+int
+ctr_add_hard_link(xlator_t *this, ctr_xlator_ctx_t *ctr_xlator_ctx,
+                  uuid_t pgfid, const char *base_name)
+{
+    int ret = -1;
+    ctr_hard_link_t *ctr_hard_link = NULL;
+    struct timeval current_time = {0};
+
+    GF_ASSERT(this);
+    GF_ASSERT(ctr_xlator_ctx);
+
+    if (pgfid == NULL || base_name == NULL)
+        goto out;
+
+    ctr_hard_link = GF_CALLOC(1, sizeof(*ctr_hard_link), gf_ctr_mt_hard_link_t);
+    if (!ctr_hard_link) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, CTR_MSG_CALLOC_FAILED,
+               "Failed allocating ctr_hard_link");
+        goto out;
+    }
+
+    /*Initialize the ctr_hard_link object and
+     * Assign the values : parent GFID and basename*/
+    INIT_LIST_HEAD(&ctr_hard_link->list);
+    gf_uuid_copy(ctr_hard_link->pgfid, pgfid);
+    ret = gf_asprintf(&ctr_hard_link->base_name, "%s", base_name);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_COPY_FAILED,
+               "Failed copying basename to ctr_hard_link");
+        goto error;
+    }
+
+    ret = gettimeofday(&current_time, NULL);
+    if (ret == -1) {
+        gf_log(this->name, GF_LOG_ERROR, "Failed to get current time");
+        goto error;
+    }
+    ctr_hard_link->hardlink_heal_period = current_time.tv_sec;
+
+    /*Add the fully initialized hard link to the list*/
+    list_add_tail(&ctr_hard_link->list, &ctr_xlator_ctx->hardlink_list);
+
+    ret = 0;
+    goto out;
+error:
+    /* base_name may already hold the copied name: free it with the link */
+    GF_FREE(ctr_hard_link->base_name);
+    GF_FREE(ctr_hard_link);
+out:
+    return ret;
+}
+
+static void
+__delete_hard_link_from_list(ctr_hard_link_t **ctr_hard_link)
+{
+    GF_ASSERT(ctr_hard_link);
+    GF_ASSERT(*ctr_hard_link);
+    /* Take the entry off its list first, then release it through
+     * fini_ctr_hard_link(), which receives the caller's pointer. */
+    list_del(&((*ctr_hard_link)->list));
+    fini_ctr_hard_link(ctr_hard_link);
+}
+
+/* Remove the hard link (pgfid, base_name) from the context's list.
+ * Takes ctr_xlator_ctx->lock itself.  Returns 0 when the link was found
+ * and removed, -1 when it is not in the list. */
+int
+ctr_delete_hard_link(xlator_t *this, ctr_xlator_ctx_t *ctr_xlator_ctx,
+                     uuid_t pgfid, const char *base_name)
+{
+    int ret = -1;
+    ctr_hard_link_t *victim = NULL;
+
+    GF_ASSERT(this);
+    GF_ASSERT(ctr_xlator_ctx);
+
+    LOCK(&ctr_xlator_ctx->lock);
+    {
+        victim = ctr_search_hard_link_ctx(this, ctr_xlator_ctx, pgfid,
+                                          base_name);
+        if (victim) {
+            __delete_hard_link_from_list(&victim);
+            ret = 0;
+        } else {
+            gf_msg(this->name, GF_LOG_ERROR, 0,
+                   CTR_MSG_HARDLINK_MISSING_IN_LIST,
+                   "Hard link doesn't exist in the list");
+        }
+    }
+    UNLOCK(&ctr_xlator_ctx->lock);
+
+    return ret;
+}
+
+/* Re-point the cached hard link (old_pgfid, old_base_name) to
+ * (pgfid, base_name); if the old link is not cached the new one is added.
+ * Takes ctr_xlator_ctx->lock itself.  Returns 0 on success. */
+int
+ctr_update_hard_link(xlator_t *this, ctr_xlator_ctx_t *ctr_xlator_ctx,
+                     uuid_t pgfid, const char *base_name, uuid_t old_pgfid,
+                     const char *old_base_name)
+{
+    int ret = -1;
+    ctr_hard_link_t *ctr_hard_link = NULL;
+    struct timeval current_time = {0};
+
+    GF_ASSERT(this);
+    GF_ASSERT(ctr_xlator_ctx);
+
+    LOCK(&ctr_xlator_ctx->lock);
+
+    /*Check if the hard link is present */
+    ctr_hard_link = ctr_search_hard_link_ctx(this, ctr_xlator_ctx, old_pgfid,
+                                             old_base_name);
+    if (!ctr_hard_link) {
+        gf_msg_trace(this->name, 0, "Hard link doesn't exist in the list");
+        /* Since the hard link is not present in the list we add it */
+        ret = ctr_add_hard_link(this, ctr_xlator_ctx, pgfid, base_name);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0,
+                   CTR_MSG_ADD_HARDLINK_TO_LIST_FAILED,
+                   "Failed adding hard link to the list");
+        }
+        goto out;
+    }
+
+    /* update the hard link */
+    gf_uuid_copy(ctr_hard_link->pgfid, pgfid);
+    GF_FREE(ctr_hard_link->base_name);
+    /* Do not keep the freed name around: if the copy below fails the
+     * entry is torn down and must not free base_name a second time. */
+    ctr_hard_link->base_name = NULL;
+    ret = gf_asprintf(&ctr_hard_link->base_name, "%s", base_name);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_COPY_FAILED,
+               "Failed copying basename to ctr_hard_link");
+        /* delete the corrupted entry */
+        __delete_hard_link_from_list(&ctr_hard_link);
+        ctr_hard_link = NULL;
+        goto out;
+    }
+
+    ret = gettimeofday(&current_time, NULL);
+    if (ret == -1) {
+        gf_log(this->name, GF_LOG_ERROR, "Failed to get current time");
+        ctr_hard_link->hardlink_heal_period = 0;
+    } else {
+        ctr_hard_link->hardlink_heal_period = current_time.tv_sec;
+    }
+
+    ret = 0;
+
+out:
+    UNLOCK(&ctr_xlator_ctx->lock);
+
+    return ret;
+}
+
+/* Delete all hardlinks cached in the given ctr context.
+ * Takes ctr_xlator_ctx->lock itself and always returns 0.
+ * 'this' is not used. */
+static int
+ctr_delete_all_hard_link(xlator_t *this, ctr_xlator_ctx_t *ctr_xlator_ctx)
+{
+    ctr_hard_link_t *cur = NULL;
+    ctr_hard_link_t *next = NULL;
+
+    GF_ASSERT(ctr_xlator_ctx);
+
+    LOCK(&ctr_xlator_ctx->lock);
+    {
+        /* 'next' is the second cursor of the _safe iterator, which lets
+         * the current entry be removed from the list inside the loop. */
+        list_for_each_entry_safe(cur, next, &ctr_xlator_ctx->hardlink_list,
+                                 list)
+        {
+            /*Remove hard link from list*/
+            __delete_hard_link_from_list(&cur);
+        }
+    }
+    UNLOCK(&ctr_xlator_ctx->lock);
+
+    return 0;
+}
+
+/* Please lock the inode before using this function.
+ * Returns the ctr context stored on the inode for this xlator, or NULL
+ * when none is stored. */
+static ctr_xlator_ctx_t *
+__get_ctr_xlator_ctx(xlator_t *this, inode_t *inode)
+{
+    uint64_t raw_ctx = 0;
+
+    GF_ASSERT(this);
+    GF_ASSERT(inode);
+
+    /* A failed lookup and a stored value of 0 both mean "no context". */
+    if (__inode_ctx_get(inode, this, &raw_ctx) < 0)
+        return NULL;
+    if (raw_ctx == 0)
+        return NULL;
+
+    /* The slot keeps the context pointer as an integer value. */
+    return (ctr_xlator_ctx_t *)(long)raw_ctx;
+}
+
+/* Return the ctr context attached to the inode, creating and attaching a
+ * new one when there is none.  Takes inode->lock; returns NULL on failure. */
+ctr_xlator_ctx_t *
+init_ctr_xlator_ctx(xlator_t *this, inode_t *inode)
+{
+    int ret = -1;
+    uint64_t _addr = 0;
+    ctr_xlator_ctx_t *ctr_xlator_ctx = NULL;
+    struct timeval current_time = {0};
+
+    GF_ASSERT(this);
+    GF_ASSERT(inode);
+
+    LOCK(&inode->lock);
+    {
+        ctr_xlator_ctx = __get_ctr_xlator_ctx(this, inode);
+        if (ctr_xlator_ctx) {
+            ret = 0;
+            goto out;
+        }
+
+        /* Everything that can fail is done before the context is
+         * attached, so the inode never keeps a pointer to freed memory. */
+        if (gettimeofday(&current_time, NULL) == -1) {
+            gf_log(this->name, GF_LOG_ERROR, "Failed to get current time");
+            goto out;
+        }
+        ctr_xlator_ctx = GF_CALLOC(1, sizeof(*ctr_xlator_ctx),
+                                   gf_ctr_mt_xlator_ctx);
+        if (!ctr_xlator_ctx)
+            goto out;
+
+        ret = LOCK_INIT(&ctr_xlator_ctx->lock);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, ret, CTR_MSG_INIT_LOCK_FAILED,
+                   "Failed init lock %s", strerror(ret));
+            goto out;
+        }
+        INIT_LIST_HEAD(&ctr_xlator_ctx->hardlink_list);
+        ctr_xlator_ctx->inode_heal_period = current_time.tv_sec;
+
+        _addr = (uint64_t)(uintptr_t)ctr_xlator_ctx;
+        ret = __inode_ctx_set(inode, this, &_addr);
+        if (ret) {
+            /* Not attached: undo LOCK_INIT before the context is freed. */
+            LOCK_DESTROY(&ctr_xlator_ctx->lock);
+            goto out;
+        }
+    }
+out:
+    if (ret) {
+        GF_FREE(ctr_xlator_ctx);
+        ctr_xlator_ctx = NULL;
+    }
+    UNLOCK(&inode->lock);
+    return ctr_xlator_ctx;
+}
+
+/* Detach the ctr context from the inode, empty its hard link list,
+ * destroy its lock and free it.  Does nothing if no context is stored. */
+void
+fini_ctr_xlator_ctx(xlator_t *this, inode_t *inode)
+{
+    uint64_t raw_ctx = 0;
+    ctr_xlator_ctx_t *ctx = NULL;
+
+    inode_ctx_del(inode, this, &raw_ctx);
+    if (raw_ctx == 0)
+        return;
+
+    ctx = (ctr_xlator_ctx_t *)(long)raw_ctx;
+
+    /* A failure to drop the hard links is only logged. */
+    if (ctr_delete_all_hard_link(this, ctx)) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, CTR_MSG_DELETE_HARDLINK_FAILED,
+               "Failed deleting all "
+               "hard links from inode context");
+    }
+
+    LOCK_DESTROY(&ctx->lock);
+    GF_FREE(ctx);
+}
+
+ctr_xlator_ctx_t *
+get_ctr_xlator_ctx(xlator_t *this, inode_t *inode)
+{
+    ctr_xlator_ctx_t *found = NULL;
+
+    /* __get_ctr_xlator_ctx() asks for the inode to be locked. */
+    LOCK(&inode->lock);
+    found = __get_ctr_xlator_ctx(this, inode);
+    UNLOCK(&inode->lock);
+    return found;
+}
diff --git a/xlators/features/changetimerecorder/src/ctr-xlator-ctx.h b/xlators/features/changetimerecorder/src/ctr-xlator-ctx.h
new file mode 100644
index 0000000..4e3bf7e
--- /dev/null
+++ b/xlators/features/changetimerecorder/src/ctr-xlator-ctx.h
@@ -0,0 +1,68 @@
+/*
+ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __CTR_XLATOR_CTX_H
+#define __CTR_XLATOR_CTX_H
+
+#include <glusterfs/xlator.h>
+#include "ctr_mem_types.h"
+#include <glusterfs/iatt.h>
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/locking.h>
+#include <glusterfs/common-utils.h>
+#include <time.h>
+#include <sys/time.h>
+
+typedef struct ctr_hard_link {
+ uuid_t pgfid; /* parent GFID of the hard link */
+ char *base_name; /* heap copy of the base name, released with GF_FREE */
+ /* Hardlink expiry : tv_sec stamp of the last add/update/expiry check;
+ * a database heal is attempted once the link timeout has passed. */
+ uint64_t hardlink_heal_period;
+ struct list_head list; /* entry in ctr_xlator_ctx_t.hardlink_list */
+} ctr_hard_link_t;
+
+typedef struct ctr_xlator_ctx {
+ /* This represents the looked up hardlinks
+ * NOTE: This doesn't represent all physical hardlinks of the inode*/
+ struct list_head hardlink_list;
+ uint64_t inode_heal_period; /* tv_sec stamp set by init_ctr_xlator_ctx() */
+ gf_lock_t lock; /* taken around accesses to hardlink_list */
+} ctr_xlator_ctx_t;
+/* Caller must hold ctr_xlator_ctx->lock. Returns NULL if nothing matches. */
+ctr_hard_link_t *
+ctr_search_hard_link_ctx(xlator_t *this, ctr_xlator_ctx_t *ctr_xlator_ctx,
+ uuid_t pgfid, const char *base_name);
+/* Caller must hold ctr_xlator_ctx->lock. Returns 0 on success. */
+int
+ctr_add_hard_link(xlator_t *this, ctr_xlator_ctx_t *ctr_xlator_ctx,
+ uuid_t pgfid, const char *base_name);
+/* Takes the lock itself. Returns -1 if the link is not in the list. */
+int
+ctr_delete_hard_link(xlator_t *this, ctr_xlator_ctx_t *ctr_xlator_ctx,
+ uuid_t pgfid, const char *base_name);
+/* Takes the lock itself. Adds the new link if the old one is not listed. */
+int
+ctr_update_hard_link(xlator_t *this, ctr_xlator_ctx_t *ctr_xlator_ctx,
+ uuid_t pgfid, const char *base_name, uuid_t old_pgfid,
+ const char *old_base_name);
+/* Returns the ctr context stored on the inode, or NULL if there is none. */
+ctr_xlator_ctx_t *
+get_ctr_xlator_ctx(xlator_t *this, inode_t *inode);
+/* Like get_ctr_xlator_ctx(), but creates the context when it is missing. */
+ctr_xlator_ctx_t *
+init_ctr_xlator_ctx(xlator_t *this, inode_t *inode);
+/* Removes the ctr context from the inode and frees it. */
+void
+fini_ctr_xlator_ctx(xlator_t *this, inode_t *inode);
+
+#endif
diff --git a/xlators/features/changetimerecorder/src/ctr_mem_types.h b/xlators/features/changetimerecorder/src/ctr_mem_types.h
new file mode 100644
index 0000000..7b8f531
--- /dev/null
+++ b/xlators/features/changetimerecorder/src/ctr_mem_types.h
@@ -0,0 +1,22 @@
+/*
+ Copyright (c) 2008-2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __CTR_MEM_TYPES_H__
+#define __CTR_MEM_TYPES_H__
+
+#include "gfdb_mem-types.h"
+
+enum gf_ctr_mem_types_ {
+ gf_ctr_mt_private_t = gfdb_mt_end + 1, /* numbering starts past gfdb_mt_end */
+ gf_ctr_mt_xlator_ctx, /* used for ctr_xlator_ctx_t allocations */
+ gf_ctr_mt_hard_link_t, /* used for ctr_hard_link_t allocations */
+ gf_ctr_mt_end
+};
+#endif
--
1.8.3.1