Fixed subtle errors in the Prometheus code that were caused by the use of
pointers. Now using simple value objects.
This commit is contained in:
Erik Brakkee 2024-08-12 18:48:28 +02:00
parent 31e59d998d
commit cd79c70822

View File

@ -15,8 +15,8 @@ const NAMESPACE = "converge"
var ( var (
// remember previous values of agent guids and clients so that we can increment // remember previous values of agent guids and clients so that we can increment
// the cumulative counters. // the cumulative counters.
lastAgents map[string]*models.Agent = make(map[string]*models.Agent) lastAgents map[string]models.Agent = make(map[string]models.Agent)
lastClients map[string]*models.Client = make(map[string]*models.Client) lastClients map[string]models.Client = make(map[string]models.Client)
cumulativeAgentCount = promauto.NewCounter(prometheus.CounterOpts{ cumulativeAgentCount = promauto.NewCounter(prometheus.CounterOpts{
Namespace: NAMESPACE, Namespace: NAMESPACE,
@ -130,19 +130,32 @@ func clientLabels(client models.Client) prometheus.Labels {
} }
func agentActive(agent models.Agent) { func agentActive(agent models.Agent) {
prevAgent := lastAgents[agent.Guid] prevAgent, ok := lastAgents[agent.Guid]
if prevAgent != nil && *prevAgent != agent { if ok && prevAgent != agent {
removeAgentMetrics(prevAgent) removeAgentInfoMetrics(prevAgent)
} }
agentInfo.With(agentLabels(agent)).Set(1) agentInfo.With(agentLabels(agent)).Set(1)
agentStartTime.
With(prometheus.Labels{"agent_guid": agent.Guid}).
Set(float64(agent.StartTime.UnixMilli()))
agentDuration.
With(prometheus.Labels{"agent_guid": agent.Guid}).
Set(float64(time.Now().Sub(agent.StartTime).Seconds()))
} }
func clientActive(client models.Client) { func clientActive(client models.Client) {
prevClient := lastClients[client.Guid] prevClient, ok := lastClients[client.Guid]
if prevClient != nil && *prevClient != client { if ok && prevClient != client {
removeClientMetrics(prevClient) removeClientInfoMetrics(prevClient)
} }
clientInfo.With(clientLabels(client)).Set(1) clientInfo.With(clientLabels(client)).Set(1)
clientStartTime.
With(prometheus.Labels{"client_guid": client.Guid}).
Set(float64(client.StartTime.UnixMilli()))
clientDuration.
With(prometheus.Labels{"client_guid": client.Guid}).
Set(float64(time.Now().Sub(client.StartTime).Seconds()))
} }
func setupPrometheus(mux *http.ServeMux, notifications chan *models.State) { func setupPrometheus(mux *http.ServeMux, notifications chan *models.State) {
@ -203,27 +216,21 @@ func updateMetricsImpl(state *models.State) {
// so that is still possible to identify the client or agent even though some values might // so that is still possible to identify the client or agent even though some values might
// become 0. // become 0.
agentGuids := make(map[string]*models.Agent) agentGuids := make(map[string]models.Agent)
clientGuids := make(map[string]*models.Client) clientGuids := make(map[string]models.Client)
agentCount.Set(float64(len(state.Agents))) agentCount.Set(float64(len(state.Agents)))
disconnectedAgents := make(map[string]*models.Agent) disconnectedAgents := make(map[string]models.Agent)
for k, v := range lastAgents { for k, v := range lastAgents {
disconnectedAgents[k] = v disconnectedAgents[k] = v
} }
for _, agent := range state.Agents { for _, agent := range state.Agents {
if lastAgents[agent.Guid] == nil { if _, ok := lastAgents[agent.Guid]; !ok {
cumulativeAgentCount.Inc() cumulativeAgentCount.Inc()
} }
delete(disconnectedAgents, agent.Guid) delete(disconnectedAgents, agent.Guid)
agentGuids[agent.Guid] = &agent agentGuids[agent.Guid] = agent
agentActive(agent) agentActive(agent)
agentStartTime.
With(prometheus.Labels{"agent_guid": agent.Guid}).
Set(float64(agent.StartTime.UnixMilli()))
agentDuration.
With(prometheus.Labels{"agent_guid": agent.Guid}).
Set(float64(time.Now().Sub(agent.StartTime).Seconds()))
} }
for _, agent := range disconnectedAgents { for _, agent := range disconnectedAgents {
removeAgentMetrics(agent) removeAgentMetrics(agent)
@ -233,23 +240,17 @@ func updateMetricsImpl(state *models.State) {
clientCount.Set(float64(len(state.Clients))) clientCount.Set(float64(len(state.Clients)))
// with this app // with this app
disconnectedClients := make(map[string]*models.Client) disconnectedClients := make(map[string]models.Client)
for k, v := range lastClients { for k, v := range lastClients {
disconnectedClients[k] = v disconnectedClients[k] = v
} }
for _, client := range state.Clients { for _, client := range state.Clients {
if lastClients[client.Guid] == nil { if _, ok := lastClients[client.Guid]; !ok {
cumulativeClientCount.Inc() cumulativeClientCount.Inc()
} }
delete(disconnectedClients, client.Guid) delete(disconnectedClients, client.Guid)
clientGuids[client.Guid] = &client clientGuids[client.Guid] = client
clientActive(client) clientActive(client)
clientStartTime.
With(prometheus.Labels{"client_guid": client.Guid}).
Set(float64(client.StartTime.UnixMilli()))
clientDuration.
With(prometheus.Labels{"client_guid": client.Guid}).
Set(float64(time.Now().Sub(client.StartTime).Seconds()))
} }
for _, client := range disconnectedClients { for _, client := range disconnectedClients {
removeClientMetrics(client) removeClientMetrics(client)
@ -257,8 +258,12 @@ func updateMetricsImpl(state *models.State) {
lastClients = clientGuids lastClients = clientGuids
} }
func removeAgentMetrics(agent *models.Agent) { func removeAgentInfoMetrics(agent models.Agent) bool {
ok1 := agentInfo.Delete(agentLabels(*agent)) return agentInfo.Delete(agentLabels(agent))
}
func removeAgentMetrics(agent models.Agent) {
ok1 := removeAgentInfoMetrics(agent)
guidLabels := prometheus.Labels{"agent_guid": agent.Guid} guidLabels := prometheus.Labels{"agent_guid": agent.Guid}
ok2 := agentStartTime.Delete(guidLabels) ok2 := agentStartTime.Delete(guidLabels)
// delayed deletion of the duration sow we are sure the prometheus has the last data. // delayed deletion of the duration sow we are sure the prometheus has the last data.
@ -270,12 +275,17 @@ func removeAgentMetrics(agent *models.Agent) {
} }
}() }()
if !ok1 || !ok2 { if !ok1 || !ok2 {
log.Printf("Could not delete all timeseries for agent %s", agent.Guid) log.Printf("Could not delete all timeseries for agent %s (info %v, starttime %v) ",
agent.Guid, ok1, ok2)
} }
} }
func removeClientMetrics(client *models.Client) { func removeClientInfoMetrics(client models.Client) bool {
ok1 := clientInfo.Delete(clientLabels(*client)) return clientInfo.Delete(clientLabels(client))
}
func removeClientMetrics(client models.Client) {
ok1 := removeClientInfoMetrics(client)
guidLabels := prometheus.Labels{"client_guid": client.Guid} guidLabels := prometheus.Labels{"client_guid": client.Guid}
ok2 := clientStartTime.Delete(guidLabels) ok2 := clientStartTime.Delete(guidLabels)
// delayed deletion of the duration sow we are sure the prometheus has the last data. // delayed deletion of the duration sow we are sure the prometheus has the last data.
@ -287,6 +297,6 @@ func removeClientMetrics(client *models.Client) {
} }
}() }()
if !ok1 || !ok2 { if !ok1 || !ok2 {
log.Printf("Could not delete all timeseries for client %s", client.Guid) log.Printf("Could not delete all timeseries for client %s (info %v, starttime %v)", client.Guid, ok1, ok2)
} }
} }