Fixed subtle errors in the Prometheus code that were caused by the use
of pointers. Now using simple value objects.
This commit is contained in:
Erik Brakkee 2024-08-12 18:48:28 +02:00
parent 31e59d998d
commit cd79c70822

View File

@ -15,8 +15,8 @@ const NAMESPACE = "converge"
var (
// remember previous values of agent guids and clients so that we can increment
// the cumulative counters.
lastAgents map[string]*models.Agent = make(map[string]*models.Agent)
lastClients map[string]*models.Client = make(map[string]*models.Client)
lastAgents map[string]models.Agent = make(map[string]models.Agent)
lastClients map[string]models.Client = make(map[string]models.Client)
cumulativeAgentCount = promauto.NewCounter(prometheus.CounterOpts{
Namespace: NAMESPACE,
@ -130,19 +130,32 @@ func clientLabels(client models.Client) prometheus.Labels {
}
// agentActive publishes the metrics for an agent that is currently present.
//
// It looks up the value previously recorded for agent.Guid in lastAgents and,
// when one exists and differs from the current agent, deletes the info series
// registered under the previous agent's labels. It then sets agentInfo to 1 for
// the current labels, sets agentStartTime to the start time in milliseconds
// since the Unix epoch, and sets agentDuration to the seconds elapsed since
// agent.StartTime.
//
// NOTE(review): this listing is a commit diff without +/- markers. The
// pointer-based lookup (`!= nil`, `*prevAgent`) and the value-based lookup
// (`, ok`) appear to be the before and after versions of the same statements —
// confirm against the actual file.
func agentActive(agent models.Agent) {
// Pointer-based lookup; presumably the version replaced by this commit.
prevAgent := lastAgents[agent.Guid]
if prevAgent != nil && *prevAgent != agent {
removeAgentMetrics(prevAgent)
// Value-based lookup: ok reports whether a previous value was recorded for this guid.
prevAgent, ok := lastAgents[agent.Guid]
if ok && prevAgent != agent {
// Only the info series is dropped here; the start-time and duration series
// are labelled by agent_guid alone and are set again below.
removeAgentInfoMetrics(prevAgent)
}
agentInfo.With(agentLabels(agent)).Set(1)
// Start time in milliseconds since the Unix epoch.
agentStartTime.
With(prometheus.Labels{"agent_guid": agent.Guid}).
Set(float64(agent.StartTime.UnixMilli()))
// Seconds elapsed between agent.StartTime and now.
agentDuration.
With(prometheus.Labels{"agent_guid": agent.Guid}).
Set(float64(time.Now().Sub(agent.StartTime).Seconds()))
}
// clientActive publishes the metrics for a client that is currently present.
//
// It looks up the value previously recorded for client.Guid in lastClients and,
// when one exists and differs from the current client, deletes the info series
// registered under the previous client's labels. It then sets clientInfo to 1
// for the current labels, sets clientStartTime to the start time in
// milliseconds since the Unix epoch, and sets clientDuration to the seconds
// elapsed since client.StartTime.
//
// NOTE(review): this listing is a commit diff without +/- markers. The
// pointer-based lookup (`!= nil`, `*prevClient`) and the value-based lookup
// (`, ok`) appear to be the before and after versions of the same statements —
// confirm against the actual file.
func clientActive(client models.Client) {
// Pointer-based lookup; presumably the version replaced by this commit.
prevClient := lastClients[client.Guid]
if prevClient != nil && *prevClient != client {
removeClientMetrics(prevClient)
// Value-based lookup: ok reports whether a previous value was recorded for this guid.
prevClient, ok := lastClients[client.Guid]
if ok && prevClient != client {
// Only the info series is dropped here; the start-time and duration series
// are labelled by client_guid alone and are set again below.
removeClientInfoMetrics(prevClient)
}
clientInfo.With(clientLabels(client)).Set(1)
// Start time in milliseconds since the Unix epoch.
clientStartTime.
With(prometheus.Labels{"client_guid": client.Guid}).
Set(float64(client.StartTime.UnixMilli()))
// Seconds elapsed between client.StartTime and now.
clientDuration.
With(prometheus.Labels{"client_guid": client.Guid}).
Set(float64(time.Now().Sub(client.StartTime).Seconds()))
}
func setupPrometheus(mux *http.ServeMux, notifications chan *models.State) {
@ -203,27 +216,21 @@ func updateMetricsImpl(state *models.State) {
// so that it is still possible to identify the client or agent even though some values might
// become 0.
agentGuids := make(map[string]*models.Agent)
clientGuids := make(map[string]*models.Client)
agentGuids := make(map[string]models.Agent)
clientGuids := make(map[string]models.Client)
agentCount.Set(float64(len(state.Agents)))
disconnectedAgents := make(map[string]*models.Agent)
disconnectedAgents := make(map[string]models.Agent)
for k, v := range lastAgents {
disconnectedAgents[k] = v
}
for _, agent := range state.Agents {
if lastAgents[agent.Guid] == nil {
if _, ok := lastAgents[agent.Guid]; !ok {
cumulativeAgentCount.Inc()
}
delete(disconnectedAgents, agent.Guid)
agentGuids[agent.Guid] = &agent
agentGuids[agent.Guid] = agent
agentActive(agent)
agentStartTime.
With(prometheus.Labels{"agent_guid": agent.Guid}).
Set(float64(agent.StartTime.UnixMilli()))
agentDuration.
With(prometheus.Labels{"agent_guid": agent.Guid}).
Set(float64(time.Now().Sub(agent.StartTime).Seconds()))
}
for _, agent := range disconnectedAgents {
removeAgentMetrics(agent)
@ -233,23 +240,17 @@ func updateMetricsImpl(state *models.State) {
clientCount.Set(float64(len(state.Clients)))
// with this app
disconnectedClients := make(map[string]*models.Client)
disconnectedClients := make(map[string]models.Client)
for k, v := range lastClients {
disconnectedClients[k] = v
}
for _, client := range state.Clients {
if lastClients[client.Guid] == nil {
if _, ok := lastClients[client.Guid]; !ok {
cumulativeClientCount.Inc()
}
delete(disconnectedClients, client.Guid)
clientGuids[client.Guid] = &client
clientGuids[client.Guid] = client
clientActive(client)
clientStartTime.
With(prometheus.Labels{"client_guid": client.Guid}).
Set(float64(client.StartTime.UnixMilli()))
clientDuration.
With(prometheus.Labels{"client_guid": client.Guid}).
Set(float64(time.Now().Sub(client.StartTime).Seconds()))
}
for _, client := range disconnectedClients {
removeClientMetrics(client)
@ -257,8 +258,12 @@ func updateMetricsImpl(state *models.State) {
lastClients = clientGuids
}
func removeAgentMetrics(agent *models.Agent) {
ok1 := agentInfo.Delete(agentLabels(*agent))
// removeAgentInfoMetrics deletes the agentInfo series whose labels are
// agentLabels(agent) and returns the result of that Delete call. Callers treat
// a false result as "could not delete" (see the log message in
// removeAgentMetrics).
func removeAgentInfoMetrics(agent models.Agent) bool {
return agentInfo.Delete(agentLabels(agent))
}
func removeAgentMetrics(agent models.Agent) {
ok1 := removeAgentInfoMetrics(agent)
guidLabels := prometheus.Labels{"agent_guid": agent.Guid}
ok2 := agentStartTime.Delete(guidLabels)
// Delayed deletion of the duration so that we are sure Prometheus has the last data.
@ -270,12 +275,17 @@ func removeAgentMetrics(agent *models.Agent) {
}
}()
if !ok1 || !ok2 {
log.Printf("Could not delete all timeseries for agent %s", agent.Guid)
log.Printf("Could not delete all timeseries for agent %s (info %v, starttime %v) ",
agent.Guid, ok1, ok2)
}
}
func removeClientMetrics(client *models.Client) {
ok1 := clientInfo.Delete(clientLabels(*client))
// removeClientInfoMetrics deletes the clientInfo series whose labels are
// clientLabels(client) and returns the result of that Delete call. Callers
// treat a false result as "could not delete" (see the log message in
// removeClientMetrics).
func removeClientInfoMetrics(client models.Client) bool {
return clientInfo.Delete(clientLabels(client))
}
func removeClientMetrics(client models.Client) {
ok1 := removeClientInfoMetrics(client)
guidLabels := prometheus.Labels{"client_guid": client.Guid}
ok2 := clientStartTime.Delete(guidLabels)
// Delayed deletion of the duration so that we are sure Prometheus has the last data.
@ -287,6 +297,6 @@ func removeClientMetrics(client *models.Client) {
}
}()
if !ok1 || !ok2 {
log.Printf("Could not delete all timeseries for client %s", client.Guid)
log.Printf("Could not delete all timeseries for client %s (info %v, starttime %v)", client.Guid, ok1, ok2)
}
}