import pcp-5.3.5-8.el8

This commit is contained in:
CentOS Sources 2022-05-10 02:59:42 -04:00 committed by Stepan Oksanichenko
parent 4bd3c021d2
commit 41bf8c0c70
17 changed files with 3732 additions and 13538 deletions

2
.gitignore vendored
View File

@ -1 +1 @@
SOURCES/pcp-5.3.1.src.tar.gz
SOURCES/pcp-5.3.5.src.tar.gz

View File

@ -1 +1 @@
5b693868cb11c09b87880ca2d57e672a3bfd63e6 SOURCES/pcp-5.3.1.src.tar.gz
ee69b4a4e33cb5366d30c4c006ba41ccd6386d87 SOURCES/pcp-5.3.5.src.tar.gz

File diff suppressed because it is too large Load Diff

View File

@ -1,849 +0,0 @@
commit 2bad6aef10339f000f7cb578108db5ee80bd640c
Author: Mark Goodwin <mgoodwin@redhat.com>
Date: Wed Jun 9 17:04:33 2021 +1000
pmproxy: add mutex for client req lists, fix https/tls support, QA
Add a new mutex to struct proxy and use it to protect parallel
multithreaded updates to the proxy->first client list.
Also use the same mutex to protect updates to the pending_writes
client list and avoid the doubly linked list corruption that was
causing parallel https/tls requests to get stuck spinning in
flush_secure_module(), as reported in BZ#1947989.
qa/1457 is extensively updated to test parallel http, https/tls
(and combinations of http and https/tls) RESTAPI calls. Previously
it only tested a single https/tls call.
With these changes, parallel https/tls RESTAPI requests from the
grafana-pcp datasource to pmproxy now work correctly whereas previously
pmproxy would hang/spin.
Resolves: RHBZ#1947989 - pmproxy hangs and consumes 100% cpu if the
redis datasource is configured with TLS.
Related: https://github.com/performancecopilot/pcp/issues/1311
diff --git a/qa/1457 b/qa/1457
index 94969f6e0..8bf395944 100755
--- a/qa/1457
+++ b/qa/1457
@@ -2,7 +2,7 @@
# PCP QA Test No. 1457
# Exercise HTTPS access to the PMWEBAPI(3).
#
-# Copyright (c) 2019 Red Hat.
+# Copyright (c) 2019,2021 Red Hat.
#
seq=`basename $0`
@@ -138,14 +138,59 @@ else
fi
date >>$seq.full
-echo "=== checking TLS operation ===" | tee -a $seq.full
-# (-k) allows us to use self-signed (insecure) certificates, so for testing only
-# (-v) provides very detailed TLS connection information, for debugging only
-curl -k --get 2>$tmp.err \
- "https://localhost:$port/pmapi/metric?name=sample.long.ten" \
- | _filter_json
-cat $tmp.err >>$seq.full
+echo "=== checking serial http operation ===" | tee -a $seq.full
+for i in 1 2 3 4; do
+ curl -Gs "http://localhost:$port/pmapi/metric?name=sample.long.ten" 2>$tmp.err$i >$tmp.out$i
+done
+for i in 1 2 3 4; do
+echo === out$i === | tee -a $seq.full
+_filter_json < $tmp.out$i
+done
+
+date >>$seq.full
+echo "=== checking parallel http operation ===" | tee -a $seq.full
+for i in 1 2 3 4; do
+ curl -Gs "http://localhost:$port/pmapi/metric?name=sample.long.ten" 2>$tmp.err$i >$tmp.out$i & 2>/dev/null eval pid$i=$!
+done
+wait $pid1 $pid2 $pid3 $pid4
+for i in 1 2 3 4; do
+echo === out$i === | tee -a $seq.full
+_filter_json < $tmp.out$i
+done
+
+date >>$seq.full
+echo "=== checking serial https/TLS operation ===" | tee -a $seq.full
+for i in 1 2 3 4; do
+ curl -k -Gs "https://localhost:$port/pmapi/metric?name=sample.long.ten" 2>$tmp.err$i >$tmp.out$i
+done
+for i in 1 2 3 4; do
+echo === out$i === | tee -a $seq.full
+_filter_json < $tmp.out$i
+done
+
date >>$seq.full
+echo "=== checking parallel https/TLS operation ===" | tee -a $seq.full
+for i in 1 2 3 4; do
+ curl -k -Gs "https://localhost:$port/pmapi/metric?name=sample.long.ten" 2>$tmp.err$i >$tmp.out$i & 2>/dev/null eval pid$i=$!
+done
+wait $pid1 $pid2 $pid3 $pid4
+for i in 1 2 3 4; do
+echo === out$i === | tee -a $seq.full
+_filter_json < $tmp.out$i
+done
+
+date >>$seq.full
+echo "=== checking parallel mixed http and https/TLS operations ===" | tee -a $seq.full
+for i in 1 3 5 7; do
+ j=`expr $i + 1`
+ curl -k -Gs "http://localhost:$port/pmapi/metric?name=sample.long.ten" 2>$tmp.err$i >$tmp.out$i & 2>/dev/null eval pid$i=$!
+ curl -k -Gs "https://localhost:$port/pmapi/metric?name=sample.long.ten" 2>$tmp.err$j >$tmp.out$j & 2>/dev/null eval pid$j=$!
+done
+wait $pid1 $pid2 $pid3 $pid4 $pid5 $pid6 $pid7 $pid8
+for i in 1 2 3 4 5 6 7 8; do
+echo === out$i === | tee -a $seq.full
+_filter_json < $tmp.out$i
+done
echo "=== check pmproxy is running ==="
pminfo -v -h localhost@localhost:$port hinv.ncpu
@@ -156,7 +201,7 @@ else
fi
# valgrind takes awhile to shutdown too
-pmsignal $pid
+pmsignal $pid >/dev/null 2>&1
pmsleep 3.5
echo "=== valgrind stdout ===" | tee -a $seq.full
cat $tmp.valout | _filter_valgrind
@@ -164,6 +209,9 @@ cat $tmp.valout | _filter_valgrind
echo "=== valgrind stderr ===" | tee -a $seq.full
cat $tmp.valerr | _filter_pmproxy_log | _filter_port
+# final kill if it's spinning
+$sudo kill -9 $pid >/dev/null 2>&1
+
# success, all done
status=0
exit
diff --git a/qa/1457.out b/qa/1457.out
index a7b64cdc5..422176db2 100644
--- a/qa/1457.out
+++ b/qa/1457.out
@@ -1,5 +1,539 @@
QA output created by 1457
-=== checking TLS operation ===
+=== checking serial http operation ===
+=== out1 ===
+{
+ "context": "CONTEXT"
+ "metrics": [
+ {
+ "name": "sample.long.ten",
+ "series": "SERIES"
+ "pmid": "29.0.11",
+ "type": "32",
+ "sem": "instant",
+ "units": "none",
+ "labels": {
+ "agent": "sample",
+ "cluster": "zero",
+ "domainname": "DOMAINNAME"
+ "hostname": "HOSTNAME"
+ "role": "testing"
+ },
+ "text-oneline": "10 as a 32-bit integer",
+ "text-help": "10 as a 32-bit integer"
+ }
+ ]
+}
+=== out2 ===
+{
+ "context": "CONTEXT"
+ "metrics": [
+ {
+ "name": "sample.long.ten",
+ "series": "SERIES"
+ "pmid": "29.0.11",
+ "type": "32",
+ "sem": "instant",
+ "units": "none",
+ "labels": {
+ "agent": "sample",
+ "cluster": "zero",
+ "domainname": "DOMAINNAME"
+ "hostname": "HOSTNAME"
+ "role": "testing"
+ },
+ "text-oneline": "10 as a 32-bit integer",
+ "text-help": "10 as a 32-bit integer"
+ }
+ ]
+}
+=== out3 ===
+{
+ "context": "CONTEXT"
+ "metrics": [
+ {
+ "name": "sample.long.ten",
+ "series": "SERIES"
+ "pmid": "29.0.11",
+ "type": "32",
+ "sem": "instant",
+ "units": "none",
+ "labels": {
+ "agent": "sample",
+ "cluster": "zero",
+ "domainname": "DOMAINNAME"
+ "hostname": "HOSTNAME"
+ "role": "testing"
+ },
+ "text-oneline": "10 as a 32-bit integer",
+ "text-help": "10 as a 32-bit integer"
+ }
+ ]
+}
+=== out4 ===
+{
+ "context": "CONTEXT"
+ "metrics": [
+ {
+ "name": "sample.long.ten",
+ "series": "SERIES"
+ "pmid": "29.0.11",
+ "type": "32",
+ "sem": "instant",
+ "units": "none",
+ "labels": {
+ "agent": "sample",
+ "cluster": "zero",
+ "domainname": "DOMAINNAME"
+ "hostname": "HOSTNAME"
+ "role": "testing"
+ },
+ "text-oneline": "10 as a 32-bit integer",
+ "text-help": "10 as a 32-bit integer"
+ }
+ ]
+}
+=== checking parallel http operation ===
+=== out1 ===
+{
+ "context": "CONTEXT"
+ "metrics": [
+ {
+ "name": "sample.long.ten",
+ "series": "SERIES"
+ "pmid": "29.0.11",
+ "type": "32",
+ "sem": "instant",
+ "units": "none",
+ "labels": {
+ "agent": "sample",
+ "cluster": "zero",
+ "domainname": "DOMAINNAME"
+ "hostname": "HOSTNAME"
+ "role": "testing"
+ },
+ "text-oneline": "10 as a 32-bit integer",
+ "text-help": "10 as a 32-bit integer"
+ }
+ ]
+}
+=== out2 ===
+{
+ "context": "CONTEXT"
+ "metrics": [
+ {
+ "name": "sample.long.ten",
+ "series": "SERIES"
+ "pmid": "29.0.11",
+ "type": "32",
+ "sem": "instant",
+ "units": "none",
+ "labels": {
+ "agent": "sample",
+ "cluster": "zero",
+ "domainname": "DOMAINNAME"
+ "hostname": "HOSTNAME"
+ "role": "testing"
+ },
+ "text-oneline": "10 as a 32-bit integer",
+ "text-help": "10 as a 32-bit integer"
+ }
+ ]
+}
+=== out3 ===
+{
+ "context": "CONTEXT"
+ "metrics": [
+ {
+ "name": "sample.long.ten",
+ "series": "SERIES"
+ "pmid": "29.0.11",
+ "type": "32",
+ "sem": "instant",
+ "units": "none",
+ "labels": {
+ "agent": "sample",
+ "cluster": "zero",
+ "domainname": "DOMAINNAME"
+ "hostname": "HOSTNAME"
+ "role": "testing"
+ },
+ "text-oneline": "10 as a 32-bit integer",
+ "text-help": "10 as a 32-bit integer"
+ }
+ ]
+}
+=== out4 ===
+{
+ "context": "CONTEXT"
+ "metrics": [
+ {
+ "name": "sample.long.ten",
+ "series": "SERIES"
+ "pmid": "29.0.11",
+ "type": "32",
+ "sem": "instant",
+ "units": "none",
+ "labels": {
+ "agent": "sample",
+ "cluster": "zero",
+ "domainname": "DOMAINNAME"
+ "hostname": "HOSTNAME"
+ "role": "testing"
+ },
+ "text-oneline": "10 as a 32-bit integer",
+ "text-help": "10 as a 32-bit integer"
+ }
+ ]
+}
+=== checking serial https/TLS operation ===
+=== out1 ===
+{
+ "context": "CONTEXT"
+ "metrics": [
+ {
+ "name": "sample.long.ten",
+ "series": "SERIES"
+ "pmid": "29.0.11",
+ "type": "32",
+ "sem": "instant",
+ "units": "none",
+ "labels": {
+ "agent": "sample",
+ "cluster": "zero",
+ "domainname": "DOMAINNAME"
+ "hostname": "HOSTNAME"
+ "role": "testing"
+ },
+ "text-oneline": "10 as a 32-bit integer",
+ "text-help": "10 as a 32-bit integer"
+ }
+ ]
+}
+=== out2 ===
+{
+ "context": "CONTEXT"
+ "metrics": [
+ {
+ "name": "sample.long.ten",
+ "series": "SERIES"
+ "pmid": "29.0.11",
+ "type": "32",
+ "sem": "instant",
+ "units": "none",
+ "labels": {
+ "agent": "sample",
+ "cluster": "zero",
+ "domainname": "DOMAINNAME"
+ "hostname": "HOSTNAME"
+ "role": "testing"
+ },
+ "text-oneline": "10 as a 32-bit integer",
+ "text-help": "10 as a 32-bit integer"
+ }
+ ]
+}
+=== out3 ===
+{
+ "context": "CONTEXT"
+ "metrics": [
+ {
+ "name": "sample.long.ten",
+ "series": "SERIES"
+ "pmid": "29.0.11",
+ "type": "32",
+ "sem": "instant",
+ "units": "none",
+ "labels": {
+ "agent": "sample",
+ "cluster": "zero",
+ "domainname": "DOMAINNAME"
+ "hostname": "HOSTNAME"
+ "role": "testing"
+ },
+ "text-oneline": "10 as a 32-bit integer",
+ "text-help": "10 as a 32-bit integer"
+ }
+ ]
+}
+=== out4 ===
+{
+ "context": "CONTEXT"
+ "metrics": [
+ {
+ "name": "sample.long.ten",
+ "series": "SERIES"
+ "pmid": "29.0.11",
+ "type": "32",
+ "sem": "instant",
+ "units": "none",
+ "labels": {
+ "agent": "sample",
+ "cluster": "zero",
+ "domainname": "DOMAINNAME"
+ "hostname": "HOSTNAME"
+ "role": "testing"
+ },
+ "text-oneline": "10 as a 32-bit integer",
+ "text-help": "10 as a 32-bit integer"
+ }
+ ]
+}
+=== checking parallel https/TLS operation ===
+=== out1 ===
+{
+ "context": "CONTEXT"
+ "metrics": [
+ {
+ "name": "sample.long.ten",
+ "series": "SERIES"
+ "pmid": "29.0.11",
+ "type": "32",
+ "sem": "instant",
+ "units": "none",
+ "labels": {
+ "agent": "sample",
+ "cluster": "zero",
+ "domainname": "DOMAINNAME"
+ "hostname": "HOSTNAME"
+ "role": "testing"
+ },
+ "text-oneline": "10 as a 32-bit integer",
+ "text-help": "10 as a 32-bit integer"
+ }
+ ]
+}
+=== out2 ===
+{
+ "context": "CONTEXT"
+ "metrics": [
+ {
+ "name": "sample.long.ten",
+ "series": "SERIES"
+ "pmid": "29.0.11",
+ "type": "32",
+ "sem": "instant",
+ "units": "none",
+ "labels": {
+ "agent": "sample",
+ "cluster": "zero",
+ "domainname": "DOMAINNAME"
+ "hostname": "HOSTNAME"
+ "role": "testing"
+ },
+ "text-oneline": "10 as a 32-bit integer",
+ "text-help": "10 as a 32-bit integer"
+ }
+ ]
+}
+=== out3 ===
+{
+ "context": "CONTEXT"
+ "metrics": [
+ {
+ "name": "sample.long.ten",
+ "series": "SERIES"
+ "pmid": "29.0.11",
+ "type": "32",
+ "sem": "instant",
+ "units": "none",
+ "labels": {
+ "agent": "sample",
+ "cluster": "zero",
+ "domainname": "DOMAINNAME"
+ "hostname": "HOSTNAME"
+ "role": "testing"
+ },
+ "text-oneline": "10 as a 32-bit integer",
+ "text-help": "10 as a 32-bit integer"
+ }
+ ]
+}
+=== out4 ===
+{
+ "context": "CONTEXT"
+ "metrics": [
+ {
+ "name": "sample.long.ten",
+ "series": "SERIES"
+ "pmid": "29.0.11",
+ "type": "32",
+ "sem": "instant",
+ "units": "none",
+ "labels": {
+ "agent": "sample",
+ "cluster": "zero",
+ "domainname": "DOMAINNAME"
+ "hostname": "HOSTNAME"
+ "role": "testing"
+ },
+ "text-oneline": "10 as a 32-bit integer",
+ "text-help": "10 as a 32-bit integer"
+ }
+ ]
+}
+=== checking parallel mixed http and https/TLS operations ===
+=== out1 ===
+{
+ "context": "CONTEXT"
+ "metrics": [
+ {
+ "name": "sample.long.ten",
+ "series": "SERIES"
+ "pmid": "29.0.11",
+ "type": "32",
+ "sem": "instant",
+ "units": "none",
+ "labels": {
+ "agent": "sample",
+ "cluster": "zero",
+ "domainname": "DOMAINNAME"
+ "hostname": "HOSTNAME"
+ "role": "testing"
+ },
+ "text-oneline": "10 as a 32-bit integer",
+ "text-help": "10 as a 32-bit integer"
+ }
+ ]
+}
+=== out2 ===
+{
+ "context": "CONTEXT"
+ "metrics": [
+ {
+ "name": "sample.long.ten",
+ "series": "SERIES"
+ "pmid": "29.0.11",
+ "type": "32",
+ "sem": "instant",
+ "units": "none",
+ "labels": {
+ "agent": "sample",
+ "cluster": "zero",
+ "domainname": "DOMAINNAME"
+ "hostname": "HOSTNAME"
+ "role": "testing"
+ },
+ "text-oneline": "10 as a 32-bit integer",
+ "text-help": "10 as a 32-bit integer"
+ }
+ ]
+}
+=== out3 ===
+{
+ "context": "CONTEXT"
+ "metrics": [
+ {
+ "name": "sample.long.ten",
+ "series": "SERIES"
+ "pmid": "29.0.11",
+ "type": "32",
+ "sem": "instant",
+ "units": "none",
+ "labels": {
+ "agent": "sample",
+ "cluster": "zero",
+ "domainname": "DOMAINNAME"
+ "hostname": "HOSTNAME"
+ "role": "testing"
+ },
+ "text-oneline": "10 as a 32-bit integer",
+ "text-help": "10 as a 32-bit integer"
+ }
+ ]
+}
+=== out4 ===
+{
+ "context": "CONTEXT"
+ "metrics": [
+ {
+ "name": "sample.long.ten",
+ "series": "SERIES"
+ "pmid": "29.0.11",
+ "type": "32",
+ "sem": "instant",
+ "units": "none",
+ "labels": {
+ "agent": "sample",
+ "cluster": "zero",
+ "domainname": "DOMAINNAME"
+ "hostname": "HOSTNAME"
+ "role": "testing"
+ },
+ "text-oneline": "10 as a 32-bit integer",
+ "text-help": "10 as a 32-bit integer"
+ }
+ ]
+}
+=== out5 ===
+{
+ "context": "CONTEXT"
+ "metrics": [
+ {
+ "name": "sample.long.ten",
+ "series": "SERIES"
+ "pmid": "29.0.11",
+ "type": "32",
+ "sem": "instant",
+ "units": "none",
+ "labels": {
+ "agent": "sample",
+ "cluster": "zero",
+ "domainname": "DOMAINNAME"
+ "hostname": "HOSTNAME"
+ "role": "testing"
+ },
+ "text-oneline": "10 as a 32-bit integer",
+ "text-help": "10 as a 32-bit integer"
+ }
+ ]
+}
+=== out6 ===
+{
+ "context": "CONTEXT"
+ "metrics": [
+ {
+ "name": "sample.long.ten",
+ "series": "SERIES"
+ "pmid": "29.0.11",
+ "type": "32",
+ "sem": "instant",
+ "units": "none",
+ "labels": {
+ "agent": "sample",
+ "cluster": "zero",
+ "domainname": "DOMAINNAME"
+ "hostname": "HOSTNAME"
+ "role": "testing"
+ },
+ "text-oneline": "10 as a 32-bit integer",
+ "text-help": "10 as a 32-bit integer"
+ }
+ ]
+}
+=== out7 ===
+{
+ "context": "CONTEXT"
+ "metrics": [
+ {
+ "name": "sample.long.ten",
+ "series": "SERIES"
+ "pmid": "29.0.11",
+ "type": "32",
+ "sem": "instant",
+ "units": "none",
+ "labels": {
+ "agent": "sample",
+ "cluster": "zero",
+ "domainname": "DOMAINNAME"
+ "hostname": "HOSTNAME"
+ "role": "testing"
+ },
+ "text-oneline": "10 as a 32-bit integer",
+ "text-help": "10 as a 32-bit integer"
+ }
+ ]
+}
+=== out8 ===
{
"context": "CONTEXT"
"metrics": [
diff --git a/qa/group b/qa/group
index 462dffaaa..77cac788d 100644
--- a/qa/group
+++ b/qa/group
@@ -1818,7 +1818,7 @@ x11
1436 pmda.postgresql local
1437 pmda.kvm local
1455 pmlogrewrite labels pmdumplog local
-1457 pmproxy local
+1457 pmproxy libpcp_web threads secure local
1480 pmda.lmsensors local
1489 python pmrep pmimport local
1490 python local labels
diff --git a/src/pmproxy/src/secure.c b/src/pmproxy/src/secure.c
index 77265894c..072e2a085 100644
--- a/src/pmproxy/src/secure.c
+++ b/src/pmproxy/src/secure.c
@@ -16,13 +16,25 @@
#include <openssl/opensslv.h>
#include <openssl/ssl.h>
+/* called with proxy->mutex locked */
static void
remove_connection_from_queue(struct client *client)
{
+ struct proxy *proxy = client->proxy;
+
if (client->secure.pending.writes_buffer != NULL)
free(client->secure.pending.writes_buffer);
- if (client->secure.pending.prev != NULL)
- *client->secure.pending.prev = client->secure.pending.next;
+ if (client->secure.pending.prev == NULL) {
+ /* next (if any) becomes first in pending_writes list */
+ proxy->pending_writes = client->secure.pending.next;
+ if (proxy->pending_writes)
+ proxy->pending_writes->secure.pending.prev = NULL;
+ }
+ else {
+ /* link next and prev */
+ client->secure.pending.prev->secure.pending.next = client->secure.pending.next;
+ client->secure.pending.next->secure.pending.prev = client->secure.pending.prev;
+ }
memset(&client->secure.pending, 0, sizeof(client->secure.pending));
}
@@ -32,7 +44,9 @@ on_secure_client_close(struct client *client)
if (pmDebugOptions.auth || pmDebugOptions.http)
fprintf(stderr, "%s: client %p\n", "on_secure_client_close", client);
+ uv_mutex_lock(&client->proxy->mutex);
remove_connection_from_queue(client);
+ uv_mutex_unlock(&client->proxy->mutex);
/* client->read and client->write freed by SSL_free */
SSL_free(client->secure.ssl);
}
@@ -40,6 +54,8 @@ on_secure_client_close(struct client *client)
static void
maybe_flush_ssl(struct proxy *proxy, struct client *client)
{
+ struct client *c;
+
if (client->secure.pending.queued)
return;
@@ -47,13 +63,19 @@ maybe_flush_ssl(struct proxy *proxy, struct client *client)
client->secure.pending.writes_count > 0)
return;
- client->secure.pending.next = proxy->pending_writes;
- if (client->secure.pending.next != NULL)
- client->secure.pending.next->secure.pending.prev = &client->secure.pending.next;
- client->secure.pending.prev = &proxy->pending_writes;
+ uv_mutex_lock(&proxy->mutex);
+ if (proxy->pending_writes == NULL) {
+ proxy->pending_writes = client;
+ client->secure.pending.prev = client->secure.pending.next = NULL;
+ }
+ else {
+ for (c=proxy->pending_writes; c->secure.pending.next; c = c->secure.pending.next)
+ ; /**/
+ c->secure.pending.next = client;
+ client->secure.pending.prev = c;
+ }
client->secure.pending.queued = 1;
-
- proxy->pending_writes = client;
+ uv_mutex_unlock(&proxy->mutex);
}
static void
@@ -135,10 +157,12 @@ flush_ssl_buffer(struct client *client)
void
flush_secure_module(struct proxy *proxy)
{
- struct client *client, **head = &proxy->pending_writes;
+ struct client *client, **head;
size_t i, used;
int sts;
+ uv_mutex_lock(&proxy->mutex);
+ head = &proxy->pending_writes;
while ((client = *head) != NULL) {
flush_ssl_buffer(client);
@@ -188,6 +212,7 @@ flush_secure_module(struct proxy *proxy)
sizeof(uv_buf_t) * client->secure.pending.writes_count);
}
}
+ uv_mutex_unlock(&proxy->mutex);
}
void
diff --git a/src/pmproxy/src/server.c b/src/pmproxy/src/server.c
index 612d3613b..b5c5d84de 100644
--- a/src/pmproxy/src/server.c
+++ b/src/pmproxy/src/server.c
@@ -149,6 +149,7 @@ server_init(int portcount, const char *localpath)
pmGetProgname());
return NULL;
}
+ uv_mutex_init(&proxy->mutex);
count = portcount + (*localpath ? 1 : 0);
if (count) {
@@ -251,6 +252,7 @@ void
client_put(struct client *client)
{
unsigned int refcount;
+ struct proxy *proxy = client->proxy;
uv_mutex_lock(&client->mutex);
assert(client->refcount);
@@ -259,9 +261,11 @@ client_put(struct client *client)
if (refcount == 0) {
/* remove client from the doubly-linked list */
+ uv_mutex_lock(&proxy->mutex);
if (client->next != NULL)
client->next->prev = client->prev;
*client->prev = client->next;
+ uv_mutex_unlock(&proxy->mutex);
if (client->protocol & STREAM_PCP)
on_pcp_client_close(client);
@@ -514,10 +518,12 @@ on_client_connection(uv_stream_t *stream, int status)
client->proxy = proxy;
/* insert client into doubly-linked list at the head */
+ uv_mutex_lock(&proxy->mutex);
if ((client->next = proxy->first) != NULL)
proxy->first->prev = &client->next;
proxy->first = client;
client->prev = &proxy->first;
+ uv_mutex_unlock(&proxy->mutex);
status = uv_read_start((uv_stream_t *)&client->stream.u.tcp,
on_buffer_alloc, on_client_read);
diff --git a/src/pmproxy/src/server.h b/src/pmproxy/src/server.h
index f0b7a5f68..f93daeff4 100644
--- a/src/pmproxy/src/server.h
+++ b/src/pmproxy/src/server.h
@@ -118,7 +118,7 @@ typedef struct secure_client {
BIO *write;
struct {
struct client *next;
- struct client **prev;
+ struct client *prev;
unsigned int queued;
size_t writes_count;
uv_buf_t *writes_buffer;
@@ -166,6 +166,7 @@ typedef struct proxy {
struct dict *config; /* configuration dictionary */
uv_loop_t *events; /* global, async event loop */
uv_callback_t write_callbacks;
+ uv_mutex_t mutex; /* protects client lists and pending writes */
} proxy;
extern void proxylog(pmLogLevel, sds, void *);

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,193 @@
commit 6b907e90c74fce82d6b712493d8b362bdd1a1ec1
Author: Nathan Scott <nathans@redhat.com>
Date: Tue Dec 14 08:54:14 2021 +1100
pmlogconf: switch to the bulk pmLookupDescs(3) interface
No functional change, all existing regression tests pass.
Related to Red Hat BZ #1973833.
diff --git a/src/pmlogconf/pmlogconf.c b/src/pmlogconf/pmlogconf.c
index fa1156859d..ef4fc08bbd 100644
--- a/src/pmlogconf/pmlogconf.c
+++ b/src/pmlogconf/pmlogconf.c
@@ -473,13 +473,19 @@ fetch_groups(void)
{
static pmResult *result;
const char **names;
+ pmDesc *descs;
pmID *pmids;
- int i, n, sts;
+ int i, n, sts, count;
- /* prepare arrays of names and identifiers for PMAPI metric lookup */
+ /* prepare arrays of names, descriptors and IDs for PMAPI metric lookup */
if ((names = calloc(ngroups, sizeof(char *))) == NULL)
return -ENOMEM;
+ if ((descs = calloc(ngroups, sizeof(pmDesc))) == NULL) {
+ free(names);
+ return -ENOMEM;
+ }
if ((pmids = calloc(ngroups, sizeof(pmID))) == NULL) {
+ free(descs);
free(names);
return -ENOMEM;
}
@@ -490,15 +496,16 @@ fetch_groups(void)
continue;
names[n++] = (const char *)groups[i].metric;
}
+ count = n;
- if ((sts = pmLookupName(n, names, pmids)) < 0) {
- if (n == 1)
+ if ((sts = pmLookupName(count, names, pmids)) < 0) {
+ if (count == 1)
groups[0].pmid = PM_ID_NULL;
else
fprintf(stderr, "%s: cannot lookup metric names: %s\n",
pmGetProgname(), pmErrStr(sts));
}
- else if ((sts = pmFetch(n, pmids, &result)) < 0) {
+ else if ((sts = pmFetch(count, pmids, &result)) < 0) {
fprintf(stderr, "%s: cannot fetch metric values: %s\n",
pmGetProgname(), pmErrStr(sts));
}
@@ -510,6 +517,13 @@ fetch_groups(void)
else
groups[i].pmid = pmids[n++];
}
+ /* descriptor lookup, descs_hash handles failure here */
+ (void) pmLookupDescs(count, pmids, descs);
+
+ /* create a hash over the descs for quick PMID lookup */
+ if ((sts = descs_hash(count, descs)) < 0)
+ fprintf(stderr, "%s: cannot hash metric descs: %s\n",
+ pmGetProgname(), pmErrStr(sts));
/* create a hash over the result for quick PMID lookup */
if ((sts = values_hash(result)) < 0)
fprintf(stderr, "%s: cannot hash metric values: %s\n",
@@ -806,14 +820,16 @@ evaluate_string_regexp(group_t *group, regex_cmp_t compare)
int i, found;
pmValueSet *vsp;
pmValue *vp;
+ pmDesc *dp;
pmAtomValue atom;
regex_t regex;
int sts, type;
- if ((vsp = metric_values(group->pmid)) == NULL)
+ if ((vsp = metric_values(group->pmid)) == NULL ||
+ (dp = metric_desc(group->pmid)) == NULL)
return 0;
- type = metric_type(group->pmid);
+ type = dp->type;
if (type < 0 || type > PM_TYPE_STRING) {
fprintf(stderr, "%s: %s uses regular expression on non-scalar metric\n",
pmGetProgname(), group->tag);
@@ -849,11 +865,14 @@ evaluate_string_regexp(group_t *group, regex_cmp_t compare)
static int
evaluate_values(group_t *group, numeric_cmp_t ncmp, string_cmp_t scmp)
{
- int type = metric_type(group->pmid);
+ pmDesc *dp;
+
+ if ((dp = metric_desc(group->pmid)) == NULL)
+ return 0;
- if (type == PM_TYPE_STRING)
+ if (dp->type == PM_TYPE_STRING)
return evaluate_string_values(group, scmp);
- return evaluate_number_values(group, type, ncmp);
+ return evaluate_number_values(group, dp->type, ncmp);
}
int
diff --git a/src/pmlogconf/util.c b/src/pmlogconf/util.c
index d44c2e529a..293eb2eca3 100644
--- a/src/pmlogconf/util.c
+++ b/src/pmlogconf/util.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Red Hat. All Rights Reserved.
+ * Copyright (c) 2020-2021 Red Hat. All Rights Reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the
@@ -14,7 +14,7 @@
#include "util.h"
static __pmHashCtl valuesctl; /* pointers to values in pmResult */
-static __pmHashCtl typesctl; /* metric types from pmLookupDesc */
+static __pmHashCtl descsctl; /* metric descs from pmLookupDesc */
int
values_hash(pmResult *result)
@@ -47,27 +47,33 @@ metric_values(pmID pmid)
}
int
-metric_type(pmID pmid)
+descs_hash(int numpmid, pmDesc *descs)
{
- __pmHashNode *node;
- pmDesc desc;
- int sts, *data;
+ unsigned int i;
+ pmDesc *dp;
+ int sts;
- if (pmid == PM_IN_NULL)
- return PM_TYPE_UNKNOWN;
- if ((node = __pmHashSearch(pmid, &typesctl)) == NULL) {
- if ((sts = pmLookupDesc(pmid, &desc)) < 0)
- return sts;
- if ((data = malloc(sizeof(int))) == NULL)
- return sts;
- *data = desc.type;
- if ((sts = __pmHashAdd(pmid, data, &typesctl)) < 0) {
- free(data);
+ if ((sts = __pmHashPreAlloc(numpmid, &descsctl)) < 0)
+ return sts;
+
+ for (i = 0; i < numpmid; i++) {
+ dp = &descs[i];
+ if ((sts = __pmHashAdd(dp->pmid, dp, &descsctl)) < 0)
return sts;
- }
- return *data;
}
- return *(int *)node->data;
+ return numpmid;
+}
+
+pmDesc *
+metric_desc(pmID pmid)
+{
+ __pmHashNode *node;
+
+ if (pmid == PM_IN_NULL)
+ return NULL;
+ if ((node = __pmHashSearch(pmid, &descsctl)) == NULL)
+ return NULL;
+ return (pmDesc *)node->data;
}
int
diff --git a/src/pmlogconf/util.h b/src/pmlogconf/util.h
index 17d856a0d7..a11350d899 100644
--- a/src/pmlogconf/util.h
+++ b/src/pmlogconf/util.h
@@ -34,7 +34,9 @@ extern void fmt(const char *, char *, size_t, int, int, fmt_t, void *);
extern int values_hash(pmResult *);
extern pmValueSet *metric_values(pmID);
-extern int metric_type(pmID);
+
+extern int descs_hash(int, pmDesc *);
+extern pmDesc *metric_desc(pmID);
typedef int (*numeric_cmp_t)(double, double);
extern int number_equal(double, double);

File diff suppressed because it is too large Load Diff

View File

@ -1,922 +0,0 @@
3f5ba2218 libpcp_web: add mutex to struct webgroup protecting the context dict
107633192 src/libpcp: be more careful when calling __pmLogChangeVol()
49bdfdfff libpcp: redefine __pmLogSetTime()
5e3b792d3 libpcp_web: plug mem leak in redisMapInsert during daily log-rolling
2a00a90b0 libpcp_web/discovery: improve lock handling and scalability
commit 3f5ba221842e6a02e9fb22e23c754854271c3c9a
Author: Mark Goodwin <mgoodwin@redhat.com>
Date: Wed Jun 9 16:44:30 2021 +1000
libpcp_web: add mutex to struct webgroup protecting the context dict
Add a mutex to the local webgroups structure in libpcp_web and
use it to protect multithreaded parallel updates (dictAdd,
dictDelete) to the groups->contexts dict and the dict traversal
in the timer driven garbage collector.
Tested by qa/297 and related tests and also an updated version
of qa/1457 (which now stress tests parallel http and https/tls
pmproxy RESTAPI calls .. in a later commit).
Related: RHBZ#1947989
Resolves: https://github.com/performancecopilot/pcp/issues/1311
diff --git a/src/libpcp_web/src/webgroup.c b/src/libpcp_web/src/webgroup.c
index 08c2518ed..35f05441b 100644
--- a/src/libpcp_web/src/webgroup.c
+++ b/src/libpcp_web/src/webgroup.c
@@ -51,14 +51,20 @@ typedef struct webgroups {
uv_loop_t *events;
unsigned int active;
uv_timer_t timer;
+ uv_mutex_t mutex;
} webgroups;
static struct webgroups *
webgroups_lookup(pmWebGroupModule *module)
{
- if (module->privdata == NULL)
+ struct webgroups *groups = module->privdata;
+
+ if (module->privdata == NULL) {
module->privdata = calloc(1, sizeof(struct webgroups));
- return (struct webgroups *)module->privdata;
+ groups = (struct webgroups *)module->privdata;
+ uv_mutex_init(&groups->mutex);
+ }
+ return groups;
}
static int
@@ -94,8 +100,11 @@ webgroup_drop_context(struct context *context, struct webgroups *groups)
context->garbage = 1;
uv_timer_stop(&context->timer);
}
- if (groups)
+ if (groups) {
+ uv_mutex_lock(&groups->mutex);
dictDelete(groups->contexts, &context->randomid);
+ uv_mutex_unlock(&groups->mutex);
+ }
uv_close((uv_handle_t *)&context->timer, webgroup_release_context);
}
}
@@ -207,13 +216,16 @@ webgroup_new_context(pmWebGroupSettings *sp, dict *params,
cp->context = -1;
cp->timeout = polltime;
+ uv_mutex_lock(&groups->mutex);
if ((cp->randomid = random()) < 0 ||
dictFind(groups->contexts, &cp->randomid) != NULL) {
infofmt(*message, "random number failure on new web context");
pmwebapi_free_context(cp);
*status = -ESRCH;
+ uv_mutex_unlock(&groups->mutex);
return NULL;
}
+ uv_mutex_unlock(&groups->mutex);
cp->origin = sdscatfmt(sdsempty(), "%i", cp->randomid);
cp->name.sds = sdsdup(hostspec ? hostspec : LOCALHOST);
cp->realm = sdscatfmt(sdsempty(), "pmapi/%i", cp->randomid);
@@ -242,7 +254,9 @@ webgroup_new_context(pmWebGroupSettings *sp, dict *params,
pmwebapi_free_context(cp);
return NULL;
}
+ uv_mutex_lock(&groups->mutex);
dictAdd(groups->contexts, &cp->randomid, cp);
+ uv_mutex_unlock(&groups->mutex);
/* leave until the end because uv_timer_init makes this visible in uv_run */
handle = (uv_handle_t *)&cp->timer;
@@ -261,25 +275,34 @@ webgroup_new_context(pmWebGroupSettings *sp, dict *params,
static void
webgroup_garbage_collect(struct webgroups *groups)
{
- dictIterator *iterator = dictGetSafeIterator(groups->contexts);
+ dictIterator *iterator;
dictEntry *entry;
context_t *cp;
if (pmDebugOptions.http || pmDebugOptions.libweb)
fprintf(stderr, "%s: started\n", "webgroup_garbage_collect");
- while ((entry = dictNext(iterator)) != NULL) {
- cp = (context_t *)dictGetVal(entry);
- if (cp->garbage && cp->privdata == groups) {
- if (pmDebugOptions.http || pmDebugOptions.libweb)
- fprintf(stderr, "GC context %u (%p)\n", cp->randomid, cp);
- webgroup_drop_context(cp, groups);
+ /* do context GC if we get the lock (else don't block here) */
+ if (uv_mutex_trylock(&groups->mutex) == 0) {
+ iterator = dictGetSafeIterator(groups->contexts);
+ for (entry = dictNext(iterator); entry;) {
+ cp = (context_t *)dictGetVal(entry);
+ entry = dictNext(iterator);
+ if (cp->garbage && cp->privdata == groups) {
+ if (pmDebugOptions.http || pmDebugOptions.libweb)
+ fprintf(stderr, "GC context %u (%p)\n", cp->randomid, cp);
+ uv_mutex_unlock(&groups->mutex);
+ webgroup_drop_context(cp, groups);
+ uv_mutex_lock(&groups->mutex);
+ }
}
+ dictReleaseIterator(iterator);
+ uv_mutex_unlock(&groups->mutex);
}
- dictReleaseIterator(iterator);
/* TODO - trim maps, particularly instmap if proc metrics are not excluded */
+ /* TODO move the following to a new stats timer */
if (groups->metrics_handle) {
mmv_stats_set(groups->metrics_handle, "contextmap.size",
NULL, dictSize(contextmap));
commit 107633192326b27ae571d4d4955052b8d86222c2
Author: Ken McDonell <kenj@kenj.id.au>
Date: Fri Jul 2 16:52:48 2021 +1000
src/libpcp: be more careful when calling __pmLogChangeVol()
Mark observed a SEGV which looks like __pmLogFetch() died because
ctxp->c_archctl->ac_mfp was (unexpectedly) NULL.
See: https://github.com/performancecopilot/pcp/issues/1338
Initial guess is that a physical file was removed by concurrent
activity (like pmlogger_check or pmlogger_daily), causing
__pmLogChangeVol() to fail ... and this was not being checked for
on the __pmLogFetch() path and in a couple of other places.
modified: interp.c
modified: logutil.c
diff --git a/src/libpcp/src/interp.c b/src/libpcp/src/interp.c
index d7effbc1e..c8f6fe382 100644
--- a/src/libpcp/src/interp.c
+++ b/src/libpcp/src/interp.c
@@ -1312,7 +1312,9 @@ __pmLogFetchInterp(__pmContext *ctxp, int numpmid, pmID pmidlist[], pmResult **r
}
/* get to the last remembered place */
- __pmLogChangeVol(ctxp->c_archctl, ctxp->c_archctl->ac_vol);
+ sts = __pmLogChangeVol(ctxp->c_archctl, ctxp->c_archctl->ac_vol);
+ if (sts < 0)
+ goto all_done;
__pmFseek(ctxp->c_archctl->ac_mfp, ctxp->c_archctl->ac_offset, SEEK_SET);
seen_mark = 0; /* interested in <mark> records seen from here on */
@@ -1397,7 +1399,9 @@ __pmLogFetchInterp(__pmContext *ctxp, int numpmid, pmID pmidlist[], pmResult **r
* at least one metric requires a bound from earlier in the log ...
* position ourselves, ... and search
*/
- __pmLogChangeVol(ctxp->c_archctl, ctxp->c_archctl->ac_vol);
+ sts = __pmLogChangeVol(ctxp->c_archctl, ctxp->c_archctl->ac_vol);
+ if (sts < 0)
+ goto all_done;
__pmFseek(ctxp->c_archctl->ac_mfp, ctxp->c_archctl->ac_offset, SEEK_SET);
done = 0;
@@ -1542,7 +1546,9 @@ __pmLogFetchInterp(__pmContext *ctxp, int numpmid, pmID pmidlist[], pmResult **r
* at least one metric requires a bound from later in the log ...
* position ourselves ... and search
*/
- __pmLogChangeVol(ctxp->c_archctl, ctxp->c_archctl->ac_vol);
+ sts = __pmLogChangeVol(ctxp->c_archctl, ctxp->c_archctl->ac_vol);
+ if (sts < 0)
+ goto all_done;
__pmFseek(ctxp->c_archctl->ac_mfp, ctxp->c_archctl->ac_offset, SEEK_SET);
done = 0;
diff --git a/src/libpcp/src/logutil.c b/src/libpcp/src/logutil.c
index fe35ed422..0ef76de25 100644
--- a/src/libpcp/src/logutil.c
+++ b/src/libpcp/src/logutil.c
@@ -1992,7 +1992,10 @@ __pmLogFetch(__pmContext *ctxp, int numpmid, pmID pmidlist[], pmResult **result)
all_derived = check_all_derived(numpmid, pmidlist);
/* re-establish position */
- __pmLogChangeVol(ctxp->c_archctl, ctxp->c_archctl->ac_vol);
+ sts = __pmLogChangeVol(ctxp->c_archctl, ctxp->c_archctl->ac_vol);
+ if (sts < 0)
+ goto func_return;
+ assert(ctxp->c_archctl->ac_mfp != NULL);
__pmFseek(ctxp->c_archctl->ac_mfp,
(long)ctxp->c_archctl->ac_offset, SEEK_SET);
@@ -2489,10 +2492,12 @@ __pmLogSetTime(__pmContext *ctxp)
/* index either not available, or not useful */
if (mode == PM_MODE_FORW) {
__pmLogChangeVol(acp, lcp->l_minvol);
+ assert(acp->ac_mfp != NULL);
__pmFseek(acp->ac_mfp, (long)(sizeof(__pmLogLabel) + 2*sizeof(int)), SEEK_SET);
}
else if (mode == PM_MODE_BACK) {
__pmLogChangeVol(acp, lcp->l_maxvol);
+ assert(acp->ac_mfp != NULL);
__pmFseek(acp->ac_mfp, (long)0, SEEK_END);
}
@@ -3141,6 +3146,7 @@ LogChangeToPreviousArchive(__pmContext *ctxp)
/* Set up to scan backwards from the end of the archive. */
__pmLogChangeVol(acp, lcp->l_maxvol);
+ assert(acp->ac_mfp != NULL);
__pmFseek(acp->ac_mfp, (long)0, SEEK_END);
ctxp->c_archctl->ac_offset = __pmFtell(acp->ac_mfp);
assert(ctxp->c_archctl->ac_offset >= 0);
commit 49bdfdfff83ac165de2bdc9a40e61a56512585d8
Author: Ken McDonell <kenj@kenj.id.au>
Date: Sun Jul 4 10:07:09 2021 +1000
libpcp: redefine __pmLogSetTime()
The problem is that if physical files for the data volumes of an
archive are removed (asynchronously by someone else) while we're
trying to switch volumes then we don't handle this safely.
The previous commit 10763319 was a stop-gap to address Mark's SEGV
issue at https://github.com/performancecopilot/pcp/issues/1338 and
simply handled direct calls to __pmLogChangeVol() and ensured the
return status was checked.
I was aware, then Coverity made a lot more people aware, that this
"fix" was incomplete, specifically the calls to __pmLogChangeVol()
from within __pmLogSetTime() were not checked.
To fix the latter we have to change the type of __pmLogSetTime() from
void to int so we can return status to indicate that __pmLogChangeVol()
has failed. And then make sure all the callers of __pmLogSetTime()
check the status returned from that function.
modified: src/libpcp/src/fetch.c
modified: src/libpcp/src/internal.h
modified: src/libpcp/src/logutil.c
Because this introduces some new -Dlog diagnostics, qa/251 needed
a bit of a make-over.
diff --git a/qa/251 b/qa/251
index 2b8a07917..f9b293e98 100755
--- a/qa/251
+++ b/qa/251
@@ -37,7 +37,7 @@ _filter()
status=1 # failure is the default!
$sudo rm -rf $tmp.* $seq.full
-trap "cd $here; rm -rf $tmp; exit \$status" 0 1 2 3 15
+trap "cd $here; rm -rf $tmp $tmp.*; exit \$status" 0 1 2 3 15
# real QA test starts here
mkdir $tmp
@@ -50,56 +50,62 @@ cd $tmp
for inst in "bin-100" "bin-100,bin-500,bin-900"
do
echo
- echo "All volumes present ... $inst ..."
- pmval -z -O $offset -D128 -t2 -a ok-mv-bar -i $inst sampledso.bin 2>err >out
- egrep 'Skip|Change' err
- _filter <out
+ echo "All volumes present ... $inst ..." | tee -a $here/$seq.full
+ pmval -z -O $offset -Dlog -t2 -a ok-mv-bar -i $inst sampledso.bin 2>$tmp.err >$tmp.out
+ cat $tmp.err >>$here/$seq.full
+ grep '^__pmLogChangeVol:' $tmp.err
+ _filter <$tmp.out
[ -f die ] && exit
echo
- echo "First volume missing ... $inst ..."
+ echo "First volume missing ... $inst ..." | tee -a $here/$seq.full
mv ok-mv-bar.0 foo.0
- pmval -z -O $offset -D128 -t2 -a ok-mv-bar -i $inst sampledso.bin 2>err >out
- egrep 'Skip|Change' err
- _filter <out
+ pmval -z -O $offset -Dlog -t2 -a ok-mv-bar -i $inst sampledso.bin 2>$tmp.err >$tmp.out
+ cat $tmp.err >>$here/$seq.full
+ grep '^__pmLogChangeVol:' $tmp.err
+ _filter <$tmp.out
[ -f die ] && exit
mv foo.0 ok-mv-bar.0
echo
- echo "Last volume missing ... $inst ..."
+ echo "Last volume missing ... $inst ..." | tee -a $here/$seq.full
mv ok-mv-bar.3 foo.3
- pmval -z -O $offset -D128 -t2 -a ok-mv-bar -i $inst sampledso.bin 2>err >out
- egrep 'Skip|Change' err
- _filter <out
+ pmval -z -O $offset -Dlog -t2 -a ok-mv-bar -i $inst sampledso.bin 2>$tmp.err >$tmp.out
+ cat $tmp.err >>$here/$seq.full
+ grep '^__pmLogChangeVol:' $tmp.err
+ _filter <$tmp.out
[ -f die ] && exit
mv foo.3 ok-mv-bar.3
echo
- echo "Second volume missing ... $inst ..."
+ echo "Second volume missing ... $inst ..." | tee -a $here/$seq.full
mv ok-mv-bar.1 foo.1
- pmval -z -O $offset -D128 -t2 -a ok-mv-bar -i $inst sampledso.bin 2>err >out
- egrep 'Skip|Change' err
- _filter <out
+ pmval -z -O $offset -Dlog -t2 -a ok-mv-bar -i $inst sampledso.bin 2>$tmp.err >$tmp.out
+ cat $tmp.err >>$here/$seq.full
+ grep '^__pmLogChangeVol:' $tmp.err
+ _filter <$tmp.out
[ -f die ] && exit
mv foo.1 ok-mv-bar.1
echo
- echo "Second last volume missing ... $inst ..."
+ echo "Second last volume missing ... $inst ..." | tee -a $here/$seq.full
mv ok-mv-bar.2 foo.2
- pmval -z -O $offset -D128 -t2 -a ok-mv-bar -i $inst sampledso.bin 2>err >out
- egrep 'Skip|Change' err
- _filter <out
+ pmval -z -O $offset -Dlog -t2 -a ok-mv-bar -i $inst sampledso.bin 2>$tmp.err >$tmp.out
+ cat $tmp.err >>$here/$seq.full
+ grep '^__pmLogChangeVol:' $tmp.err
+ _filter <$tmp.out
[ -f die ] && exit
mv foo.2 ok-mv-bar.2
echo
- echo "All volumes but second missing ... $inst ..."
+ echo "All volumes but second missing ... $inst ..." | tee -a $here/$seq.full
mv ok-mv-bar.0 foo.0
mv ok-mv-bar.2 foo.2
mv ok-mv-bar.3 foo.3
- pmval -z -O $offset -D128 -t2 -a ok-mv-bar -i $inst sampledso.bin 2>err >out
- egrep 'Skip|Change' err
- _filter <out
+ pmval -z -O $offset -Dlog -t2 -a ok-mv-bar -i $inst sampledso.bin 2>$tmp.err >$tmp.out
+ cat $tmp.err >>$here/$seq.full
+ grep '^__pmLogChangeVol:' $tmp.err
+ _filter <$tmp.out
[ -f die ] && exit
mv foo.0 ok-mv-bar.0
mv foo.2 ok-mv-bar.2
diff --git a/src/libpcp/src/fetch.c b/src/libpcp/src/fetch.c
index 5328a2807..01d5bf7fc 100644
--- a/src/libpcp/src/fetch.c
+++ b/src/libpcp/src/fetch.c
@@ -458,6 +458,7 @@ pmSetMode(int mode, const struct timeval *when, int delta)
/* assume PM_CONTEXT_ARCHIVE */
if (l_mode == PM_MODE_INTERP ||
l_mode == PM_MODE_FORW || l_mode == PM_MODE_BACK) {
+ int lsts;
if (when != NULL) {
/*
* special case of NULL for timestamp
@@ -468,7 +469,18 @@ pmSetMode(int mode, const struct timeval *when, int delta)
}
ctxp->c_mode = mode;
ctxp->c_delta = delta;
- __pmLogSetTime(ctxp);
+ lsts = __pmLogSetTime(ctxp);
+ if (lsts < 0) {
+ /*
+ * most unlikely; not much we can do here but expect
+ * PMAPI error to be returned once pmFetch's start
+ */
+ if (pmDebugOptions.log) {
+ char errmsg[PM_MAXERRMSGLEN];
+ fprintf(stderr, "pmSetMode: __pmLogSetTime failed: %s\n",
+ pmErrStr_r(lsts, errmsg, sizeof(errmsg)));
+ }
+ }
__pmLogResetInterp(ctxp);
sts = 0;
}
diff --git a/src/libpcp/src/internal.h b/src/libpcp/src/internal.h
index 977efdcf6..fd8d6e740 100644
--- a/src/libpcp/src/internal.h
+++ b/src/libpcp/src/internal.h
@@ -407,7 +407,7 @@ extern int __pmLogGenerateMark(__pmLogCtl *, int, pmResult **) _PCP_HIDDEN;
extern int __pmLogFetchInterp(__pmContext *, int, pmID *, pmResult **) _PCP_HIDDEN;
extern int __pmGetArchiveLabel(__pmLogCtl *, pmLogLabel *) _PCP_HIDDEN;
extern pmTimeval *__pmLogStartTime(__pmArchCtl *) _PCP_HIDDEN;
-extern void __pmLogSetTime(__pmContext *) _PCP_HIDDEN;
+extern int __pmLogSetTime(__pmContext *) _PCP_HIDDEN;
extern void __pmLogResetInterp(__pmContext *) _PCP_HIDDEN;
extern void __pmArchCtlFree(__pmArchCtl *) _PCP_HIDDEN;
extern int __pmLogChangeArchive(__pmContext *, int) _PCP_HIDDEN;
diff --git a/src/libpcp/src/logutil.c b/src/libpcp/src/logutil.c
index 0ef76de25..2ea559bfe 100644
--- a/src/libpcp/src/logutil.c
+++ b/src/libpcp/src/logutil.c
@@ -1995,7 +1995,6 @@ __pmLogFetch(__pmContext *ctxp, int numpmid, pmID pmidlist[], pmResult **result)
sts = __pmLogChangeVol(ctxp->c_archctl, ctxp->c_archctl->ac_vol);
if (sts < 0)
goto func_return;
- assert(ctxp->c_archctl->ac_mfp != NULL);
__pmFseek(ctxp->c_archctl->ac_mfp,
(long)ctxp->c_archctl->ac_offset, SEEK_SET);
@@ -2010,7 +2009,9 @@ more:
* no serial access, so need to make sure we are
* starting in the correct place
*/
- __pmLogSetTime(ctxp);
+ sts = __pmLogSetTime(ctxp);
+ if (sts < 0)
+ goto func_return;
ctxp->c_archctl->ac_offset = __pmFtell(ctxp->c_archctl->ac_mfp);
ctxp->c_archctl->ac_vol = ctxp->c_archctl->ac_curvol;
/*
@@ -2299,7 +2300,7 @@ VolSkip(__pmArchCtl *acp, int mode, int j)
return PM_ERR_EOL;
}
-void
+int
__pmLogSetTime(__pmContext *ctxp)
{
__pmArchCtl *acp = ctxp->c_archctl;
@@ -2356,6 +2357,7 @@ __pmLogSetTime(__pmContext *ctxp)
if (lcp->l_numti) {
/* we have a temporal index, use it! */
int j = -1;
+ int try;
int toobig = 0;
int match = 0;
int vol;
@@ -2406,9 +2408,13 @@ __pmLogSetTime(__pmContext *ctxp)
acp->ac_serial = 1;
if (match) {
+ try = j;
j = VolSkip(acp, mode, j);
- if (j < 0)
- return;
+ if (j < 0) {
+ if (pmDebugOptions.log)
+ fprintf(stderr, "__pmLogSetTime: VolSkip mode=%d vol=%d failed #1\n", mode, try);
+ return PM_ERR_LOGFILE;
+ }
__pmFseek(acp->ac_mfp, (long)lcp->l_ti[j].ti_log, SEEK_SET);
if (mode == PM_MODE_BACK)
acp->ac_serial = 0;
@@ -2418,9 +2424,13 @@ __pmLogSetTime(__pmContext *ctxp)
}
}
else if (j < 1) {
+ try = 0;
j = VolSkip(acp, PM_MODE_FORW, 0);
- if (j < 0)
- return;
+ if (j < 0) {
+ if (pmDebugOptions.log)
+ fprintf(stderr, "__pmLogSetTime: VolSkip mode=%d vol=%d failed #2\n", PM_MODE_FORW, try);
+ return PM_ERR_LOGFILE;
+ }
__pmFseek(acp->ac_mfp, (long)lcp->l_ti[j].ti_log, SEEK_SET);
if (pmDebugOptions.log) {
fprintf(stderr, " before start ti@");
@@ -2428,9 +2438,13 @@ __pmLogSetTime(__pmContext *ctxp)
}
}
else if (j == numti) {
+ try = numti-1;
j = VolSkip(acp, PM_MODE_BACK, numti-1);
- if (j < 0)
- return;
+ if (j < 0) {
+ if (pmDebugOptions.log)
+ fprintf(stderr, "__pmLogSetTime: VolSkip mode=%d vol=%d failed #3\n", PM_MODE_BACK, try);
+ return PM_ERR_LOGFILE;
+ }
__pmFseek(acp->ac_mfp, (long)lcp->l_ti[j].ti_log, SEEK_SET);
if (mode == PM_MODE_BACK)
acp->ac_serial = 0;
@@ -2450,9 +2464,13 @@ __pmLogSetTime(__pmContext *ctxp)
t_hi = __pmTimevalSub(&lcp->l_ti[j].ti_stamp, &ctxp->c_origin);
t_lo = __pmTimevalSub(&ctxp->c_origin, &lcp->l_ti[j-1].ti_stamp);
if (t_hi <= t_lo && !toobig) {
+ try = j;
j = VolSkip(acp, mode, j);
- if (j < 0)
- return;
+ if (j < 0) {
+ if (pmDebugOptions.log)
+ fprintf(stderr, "__pmLogSetTime: VolSkip mode=%d vol=%d failed #4\n", mode, try);
+ return PM_ERR_LOGFILE;
+ }
__pmFseek(acp->ac_mfp, (long)lcp->l_ti[j].ti_log, SEEK_SET);
if (mode == PM_MODE_FORW)
acp->ac_serial = 0;
@@ -2462,9 +2480,13 @@ __pmLogSetTime(__pmContext *ctxp)
}
}
else {
+ try = j-1;
j = VolSkip(acp, mode, j-1);
- if (j < 0)
- return;
+ if (j < 0) {
+ if (pmDebugOptions.log)
+ fprintf(stderr, "__pmLogSetTime: VolSkip mode=%d vol=%d failed #5\n", mode, try);
+ return PM_ERR_LOGFILE;
+ }
__pmFseek(acp->ac_mfp, (long)lcp->l_ti[j].ti_log, SEEK_SET);
if (mode == PM_MODE_BACK)
acp->ac_serial = 0;
@@ -2490,14 +2512,37 @@ __pmLogSetTime(__pmContext *ctxp)
}
else {
/* index either not available, or not useful */
+ int j;
if (mode == PM_MODE_FORW) {
- __pmLogChangeVol(acp, lcp->l_minvol);
- assert(acp->ac_mfp != NULL);
+ for (j = lcp->l_minvol; j <= lcp->l_maxvol; j++) {
+ if (__pmLogChangeVol(acp, j) >= 0)
+ break;
+ }
+ if (j > lcp->l_maxvol) {
+ /* no volume found */
+ if (pmDebugOptions.log)
+ fprintf(stderr, " index not useful, no volume between %d...%d\n",
+ lcp->l_minvol, lcp->l_maxvol);
+ acp->ac_curvol = -1;
+ acp->ac_mfp = NULL;
+ return PM_ERR_LOGFILE;
+ }
__pmFseek(acp->ac_mfp, (long)(sizeof(__pmLogLabel) + 2*sizeof(int)), SEEK_SET);
}
else if (mode == PM_MODE_BACK) {
- __pmLogChangeVol(acp, lcp->l_maxvol);
- assert(acp->ac_mfp != NULL);
+ for (j = lcp->l_maxvol; j >= lcp->l_minvol; j--) {
+ if (__pmLogChangeVol(acp, j) >= 0)
+ break;
+ }
+ if (j < lcp->l_minvol) {
+ /* no volume found */
+ if (pmDebugOptions.log)
+ fprintf(stderr, " index not useful, no volume between %d...%d\n",
+ lcp->l_maxvol, lcp->l_minvol);
+ acp->ac_curvol = -1;
+ acp->ac_mfp = NULL;
+ return PM_ERR_LOGFILE;
+ }
__pmFseek(acp->ac_mfp, (long)0, SEEK_END);
}
@@ -2513,6 +2558,8 @@ __pmLogSetTime(__pmContext *ctxp)
acp->ac_offset = __pmFtell(acp->ac_mfp);
assert(acp->ac_offset >= 0);
acp->ac_vol = acp->ac_curvol;
+
+ return 0;
}
/* Read the label of the current archive. */
@@ -3100,6 +3147,7 @@ LogChangeToPreviousArchive(__pmContext *ctxp)
pmTimeval save_origin;
int save_mode;
int sts;
+ int j;
/*
* Check whether there is a previous archive to switch to.
@@ -3145,12 +3193,23 @@ LogChangeToPreviousArchive(__pmContext *ctxp)
}
/* Set up to scan backwards from the end of the archive. */
- __pmLogChangeVol(acp, lcp->l_maxvol);
- assert(acp->ac_mfp != NULL);
+ for (j = lcp->l_maxvol; j >= lcp->l_minvol; j--) {
+ if (__pmLogChangeVol(acp, j) >= 0)
+ break;
+ }
+ if (j < lcp->l_minvol) {
+ /* no volume found */
+ if (pmDebugOptions.log)
+ fprintf(stderr, "LogChangeToPreviousArchive: no volume between %d...%d\n",
+ lcp->l_maxvol, lcp->l_minvol);
+ acp->ac_curvol = -1;
+ acp->ac_mfp = NULL;
+ return PM_ERR_LOGFILE;
+ }
__pmFseek(acp->ac_mfp, (long)0, SEEK_END);
- ctxp->c_archctl->ac_offset = __pmFtell(acp->ac_mfp);
- assert(ctxp->c_archctl->ac_offset >= 0);
- ctxp->c_archctl->ac_vol = ctxp->c_archctl->ac_curvol;
+ acp->ac_offset = __pmFtell(acp->ac_mfp);
+ assert(acp->ac_offset >= 0);
+ acp->ac_vol = acp->ac_curvol;
/*
* Check for temporal overlap here. Do this last in case the API client
commit 5e3b792d3d8ae60f2cebbd51c37b9b0722c3b26e
Author: Mark Goodwin <mgoodwin@redhat.com>
Date: Tue Jul 6 20:09:28 2021 +1000
libpcp_web: plug mem leak in redisMapInsert during daily log-rolling
When pmlogger_daily processes daily archives, the resulting
merged archive(s) are discovered and processed by pmproxy
(if the discovery module is enabled). Since the metadata and
logvol data in each merged archive is likely to have already
been previously processed (but discovery doesn't know this),
we see a lot of dict updates for existing keys and values that
are already mapped.
Static analysis by Coverity (CID323605 Resource Leak) shows
when redisMapInsert calls dictAdd for an existing key, the
new value field is assigned but the old value is not free'd,
and so it leaks.
Related: RHBZ1975069 and https://github.com/performancecopilot/pcp/issues/1318
diff --git a/src/libpcp_web/src/maps.c b/src/libpcp_web/src/maps.c
index 013ef02d3..ce20476c9 100644
--- a/src/libpcp_web/src/maps.c
+++ b/src/libpcp_web/src/maps.c
@@ -160,6 +160,12 @@ redisMapLookup(redisMap *map, sds key)
void
redisMapInsert(redisMap *map, sds key, sds value)
{
+ redisMapEntry *entry = redisMapLookup(map, key);
+
+ if (entry) {
+ /* fix for Coverity CID323605 Resource Leak */
+ dictDelete(map, key);
+ }
dictAdd(map, key, value);
}
commit 2a00a90b0bc3aecb8465fd32aef1ddbe745b2c91
Author: Mark Goodwin <mgoodwin@redhat.com>
Date: Tue Jul 6 20:43:01 2021 +1000
libpcp_web/discovery: improve lock handling and scalability
Rework the global log-rolling lock detection with finer grain
(per-pmlogger directory) detection, and break early in
process_meta() and process_logvol() if a lock file is found
in the same directory as a monitored archive. This is much
more scalable since archive directories that are not locked
can continue to be processed and ingested whilst log-rolling
progresses elsewhere. Also uses much less CPU time since we
don't need a traversal of all monitored archives looking for
locks on every fs change event.
Also improve process_logvol/meta handling for archives that
are deleted whilst being processed by the discovery module.
In conjunction with Kenj's changes in libpcp - stop processing
metadata and logvols if pmFetchArchive returns -ENOENT .. the
archive has been deleted so there is no point further ingesting
its data.
Related: RHBZ1975069
Related: https://github.com/performancecopilot/pcp/issues/1338
diff --git a/src/libpcp_web/src/discover.c b/src/libpcp_web/src/discover.c
index 991055ce5..964813f66 100644
--- a/src/libpcp_web/src/discover.c
+++ b/src/libpcp_web/src/discover.c
@@ -33,9 +33,6 @@ static char *pmDiscoverFlagsStr(pmDiscover *);
#define PM_DISCOVER_HASHTAB_SIZE 32
static pmDiscover *discover_hashtable[PM_DISCOVER_HASHTAB_SIZE];
-/* pmlogger_daily log-roll lock count */
-static int logrolling = 0;
-
/* number of archives or directories currently being monitored */