Mohamed Abukar | 3e03815 | 2020-03-04 10:01:45 +0200 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (c) 2020 AT&T Intellectual Property. |
| 3 | * Copyright (c) 2020 Nokia. |
| 4 | * |
| 5 | * Licensed under the Apache License, Version 2.0 (the "License"); |
| 6 | * you may not use this file except in compliance with the License. |
| 7 | * You may obtain a copy of the License at |
| 8 | * |
| 9 | * http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | * |
| 11 | * Unless required by applicable law or agreed to in writing, software |
| 12 | * distributed under the License is distributed on an "AS IS" BASIS, |
| 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | * See the License for the specific language governing permissions and |
| 15 | * limitations under the License. |
| 16 | * |
| 17 | * This source code is part of the near-RT RIC (RAN Intelligent Controller) |
| 18 | * platform project (RICP). |
| 19 | */ |
Mohamed Abukar | 4e7e712 | 2020-03-04 10:01:45 +0200 | [diff] [blame] | 20 | |
Mohamed Abukar | 3e03815 | 2020-03-04 10:01:45 +0200 | [diff] [blame] | 21 | package main |
| 22 | |
| 23 | import ( |
Mohamed Abukar | 105030f | 2020-10-22 18:08:34 +0300 | [diff] [blame] | 24 | "bytes" |
Mohamed Abukar | 4e7e712 | 2020-03-04 10:01:45 +0200 | [diff] [blame] | 25 | "encoding/json" |
| 26 | "fmt" |
Anssi Mannila | 18fd03c | 2020-10-29 10:01:00 +0200 | [diff] [blame] | 27 | "io/ioutil" |
| 28 | "net/http" |
| 29 | "os" |
| 30 | "time" |
| 31 | |
vipin | 14323a9 | 2020-09-25 10:03:43 +0000 | [diff] [blame] | 32 | "gerrit.o-ran-sc.org/r/ric-plt/alarm-go/alarm" |
| 33 | app "gerrit.o-ran-sc.org/r/ric-plt/xapp-frame/pkg/xapp" |
Mohamed Abukar | 4e7e712 | 2020-03-04 10:01:45 +0200 | [diff] [blame] | 34 | clientruntime "github.com/go-openapi/runtime/client" |
| 35 | "github.com/go-openapi/strfmt" |
| 36 | "github.com/prometheus/alertmanager/api/v2/client" |
| 37 | "github.com/prometheus/alertmanager/api/v2/client/alert" |
| 38 | "github.com/prometheus/alertmanager/api/v2/models" |
| 39 | "github.com/spf13/viper" |
Mohamed Abukar | 3e03815 | 2020-03-04 10:01:45 +0200 | [diff] [blame] | 40 | ) |
| 41 | |
Mohamed Abukar | 3649fae | 2020-10-30 23:51:39 +0200 | [diff] [blame] | 42 | func (a *AlarmManager) ClearExpiredAlarms(m AlarmNotification, idx int, mLocked bool) bool { |
| 43 | d, ok := alarm.RICAlarmDefinitions[m.Alarm.SpecificProblem] |
| 44 | if !ok || d.TimeToLive == 0 { |
| 45 | return false |
| 46 | } |
| 47 | |
| 48 | elapsed := (time.Now().UnixNano() - m.AlarmTime) / 1e9 |
| 49 | if int(elapsed) >= d.TimeToLive { |
| 50 | app.Logger.Info("Alarm (sp=%d id=%d) with TTL=%d expired, clearing ...", m.Alarm.SpecificProblem, m.AlarmId, d.TimeToLive) |
| 51 | |
| 52 | m.AlarmAction = alarm.AlarmActionClear |
| 53 | m.AlarmTime = time.Now().UnixNano() |
| 54 | |
| 55 | if !mLocked { // For testing purpose |
| 56 | a.mutex.Lock() |
| 57 | } |
| 58 | a.ProcessClearAlarm(&m, d, idx) |
| 59 | return true |
| 60 | } |
| 61 | return false |
| 62 | } |
| 63 | |
| 64 | func (a *AlarmManager) StartTTLTimer(interval int) { |
| 65 | tick := time.Tick(time.Duration(interval) * time.Second) |
| 66 | for range tick { |
| 67 | a.mutex.Lock() |
| 68 | for idx, m := range a.activeAlarms { |
| 69 | if a.ClearExpiredAlarms(m, idx, true) { |
| 70 | a.mutex.Lock() // ClearExpiredAlarms unlocks the mutex, so re-lock here |
| 71 | continue |
| 72 | } |
| 73 | } |
| 74 | a.mutex.Unlock() |
| 75 | } |
| 76 | } |
| 77 | |
Abukar Mohamed | 121e8b6 | 2020-09-18 11:41:33 +0000 | [diff] [blame] | 78 | func (a *AlarmManager) StartAlertTimer() { |
Mohamed Abukar | 4e7e712 | 2020-03-04 10:01:45 +0200 | [diff] [blame] | 79 | tick := time.Tick(time.Duration(a.alertInterval) * time.Millisecond) |
| 80 | for range tick { |
Mohamed Abukar | af0c570 | 2020-03-11 10:29:40 +0200 | [diff] [blame] | 81 | a.mutex.Lock() |
Mohamed Abukar | 4e7e712 | 2020-03-04 10:01:45 +0200 | [diff] [blame] | 82 | for _, m := range a.activeAlarms { |
| 83 | app.Logger.Info("Re-raising alarm: %v", m) |
vipin | ba2ef5b | 2020-11-06 11:24:48 +0000 | [diff] [blame] | 84 | a.PostAlert(a.GenerateAlertLabels(m.AlarmId, m.Alarm, AlertStatusActive, m.AlarmTime)) |
Mohamed Abukar | 4e7e712 | 2020-03-04 10:01:45 +0200 | [diff] [blame] | 85 | } |
Mohamed Abukar | af0c570 | 2020-03-11 10:29:40 +0200 | [diff] [blame] | 86 | a.mutex.Unlock() |
Mohamed Abukar | 4e7e712 | 2020-03-04 10:01:45 +0200 | [diff] [blame] | 87 | } |
| 88 | } |
| 89 | |
Abukar Mohamed | 121e8b6 | 2020-09-18 11:41:33 +0000 | [diff] [blame] | 90 | func (a *AlarmManager) Consume(rp *app.RMRParams) (err error) { |
Mohamed Abukar | 4e7e712 | 2020-03-04 10:01:45 +0200 | [diff] [blame] | 91 | app.Logger.Info("Message received!") |
| 92 | |
| 93 | defer app.Rmr.Free(rp.Mbuf) |
| 94 | switch rp.Mtype { |
| 95 | case alarm.RIC_ALARM_UPDATE: |
| 96 | a.HandleAlarms(rp) |
| 97 | default: |
| 98 | app.Logger.Info("Unknown Message Type '%d', discarding", rp.Mtype) |
| 99 | } |
| 100 | |
| 101 | return nil |
| 102 | } |
| 103 | |
Abukar Mohamed | 121e8b6 | 2020-09-18 11:41:33 +0000 | [diff] [blame] | 104 | func (a *AlarmManager) HandleAlarms(rp *app.RMRParams) (*alert.PostAlertsOK, error) { |
Mohamed Abukar | 4e7e712 | 2020-03-04 10:01:45 +0200 | [diff] [blame] | 105 | var m alarm.AlarmMessage |
Lott, Christopher (cl778h) | 3e8e2aa | 2020-06-03 08:52:14 -0400 | [diff] [blame] | 106 | app.Logger.Info("Received JSON: %s", rp.Payload) |
Mohamed Abukar | 4e7e712 | 2020-03-04 10:01:45 +0200 | [diff] [blame] | 107 | if err := json.Unmarshal(rp.Payload, &m); err != nil { |
| 108 | app.Logger.Error("json.Unmarshal failed: %v", err) |
| 109 | return nil, err |
| 110 | } |
| 111 | app.Logger.Info("newAlarm: %v", m) |
| 112 | |
Mohamed Abukar | 105030f | 2020-10-22 18:08:34 +0300 | [diff] [blame] | 113 | return a.ProcessAlarm(&AlarmNotification{m, alarm.AlarmDefinition{}}) |
Mohamed Abukar | 540ceee | 2020-09-09 08:07:40 +0300 | [diff] [blame] | 114 | } |
| 115 | |
Mohamed Abukar | 105030f | 2020-10-22 18:08:34 +0300 | [diff] [blame] | 116 | func (a *AlarmManager) ProcessAlarm(m *AlarmNotification) (*alert.PostAlertsOK, error) { |
vipin | 6f73fa3 | 2020-10-06 06:51:53 +0000 | [diff] [blame] | 117 | a.mutex.Lock() |
Anssi Mannila | 18fd03c | 2020-10-29 10:01:00 +0200 | [diff] [blame] | 118 | alarmDef := &alarm.AlarmDefinition{} |
| 119 | var ok bool |
| 120 | if alarmDef, ok = alarm.RICAlarmDefinitions[m.Alarm.SpecificProblem]; !ok { |
Mohamed Abukar | 540ceee | 2020-09-09 08:07:40 +0300 | [diff] [blame] | 121 | app.Logger.Warn("Alarm (SP='%d') not recognized, suppressing ...", m.Alarm.SpecificProblem) |
vipin | 6f73fa3 | 2020-10-06 06:51:53 +0000 | [diff] [blame] | 122 | a.mutex.Unlock() |
Mohamed Abukar | 4e7e712 | 2020-03-04 10:01:45 +0200 | [diff] [blame] | 123 | return nil, nil |
| 124 | } |
| 125 | |
Mohamed Abukar | 4e7e712 | 2020-03-04 10:01:45 +0200 | [diff] [blame] | 126 | idx, found := a.IsMatchFound(m.Alarm) |
Anssi Mannila | 18fd03c | 2020-10-29 10:01:00 +0200 | [diff] [blame] | 127 | // Suppress duplicate alarms |
vipin | 4cedd50 | 2020-09-25 05:58:31 +0000 | [diff] [blame] | 128 | if found && m.AlarmAction == alarm.AlarmActionRaise { |
Mohamed Abukar | 540ceee | 2020-09-09 08:07:40 +0300 | [diff] [blame] | 129 | app.Logger.Info("Duplicate alarm found, suppressing ...") |
Anssi Mannila | fe07bd1 | 2020-09-24 14:02:57 +0300 | [diff] [blame] | 130 | if m.PerceivedSeverity == a.activeAlarms[idx].PerceivedSeverity { |
| 131 | // Duplicate with same severity found |
vipin | 6f73fa3 | 2020-10-06 06:51:53 +0000 | [diff] [blame] | 132 | a.mutex.Unlock() |
Anssi Mannila | fe07bd1 | 2020-09-24 14:02:57 +0300 | [diff] [blame] | 133 | return nil, nil |
| 134 | } else { |
| 135 | // Remove duplicate with different severity |
| 136 | a.activeAlarms = a.RemoveAlarm(a.activeAlarms, idx, "active") |
| 137 | } |
Mohamed Abukar | 4e7e712 | 2020-03-04 10:01:45 +0200 | [diff] [blame] | 138 | } |
| 139 | |
| 140 | // Clear alarm if found from active alarm list |
Mohamed Abukar | 2336a84 | 2020-10-30 16:19:38 +0200 | [diff] [blame] | 141 | if found && m.AlarmAction == alarm.AlarmActionClear { |
| 142 | return a.ProcessClearAlarm(m, alarmDef, idx) |
Mohamed Abukar | 4e7e712 | 2020-03-04 10:01:45 +0200 | [diff] [blame] | 143 | } |
| 144 | |
| 145 | // New alarm -> update active alarms and post to Alert Manager |
| 146 | if m.AlarmAction == alarm.AlarmActionRaise { |
Mohamed Abukar | 2336a84 | 2020-10-30 16:19:38 +0200 | [diff] [blame] | 147 | return a.ProcessRaiseAlarm(m, alarmDef) |
Mohamed Abukar | 4e7e712 | 2020-03-04 10:01:45 +0200 | [diff] [blame] | 148 | } |
| 149 | |
vipin | 6f73fa3 | 2020-10-06 06:51:53 +0000 | [diff] [blame] | 150 | a.mutex.Unlock() |
Mohamed Abukar | 4e7e712 | 2020-03-04 10:01:45 +0200 | [diff] [blame] | 151 | return nil, nil |
| 152 | } |
| 153 | |
Mohamed Abukar | 2336a84 | 2020-10-30 16:19:38 +0200 | [diff] [blame] | 154 | func (a *AlarmManager) ProcessRaiseAlarm(m *AlarmNotification, alarmDef *alarm.AlarmDefinition) (*alert.PostAlertsOK, error) { |
Anssi Mannila | 18fd03c | 2020-10-29 10:01:00 +0200 | [diff] [blame] | 155 | app.Logger.Debug("Raise alarmDef.RaiseDelay = %v, AlarmNotification = %v", alarmDef.RaiseDelay, *m) |
Mohamed Abukar | 2336a84 | 2020-10-30 16:19:38 +0200 | [diff] [blame] | 156 | |
Anssi Mannila | 18fd03c | 2020-10-29 10:01:00 +0200 | [diff] [blame] | 157 | // RaiseDelay > 0 in an alarm object in active alarm table indicates that raise delay is still ongoing for the alarm |
| 158 | m.AlarmDefinition.RaiseDelay = alarmDef.RaiseDelay |
| 159 | a.UpdateAlarmFields(a.GenerateAlarmId(), m) |
| 160 | a.UpdateActiveAlarmList(m) |
| 161 | a.mutex.Unlock() |
Mohamed Abukar | 2336a84 | 2020-10-30 16:19:38 +0200 | [diff] [blame] | 162 | |
Anssi Mannila | 18fd03c | 2020-10-29 10:01:00 +0200 | [diff] [blame] | 163 | if alarmDef.RaiseDelay > 0 { |
| 164 | timerDelay(alarmDef.RaiseDelay) |
| 165 | a.mutex.Lock() |
| 166 | // Alarm may have been deleted from active alarms table during delay or table index may have changed |
| 167 | idx, found := a.IsMatchFound(m.Alarm) |
| 168 | if found { |
| 169 | // Alarm is not showed in active alarms or alarm history via CLI before RaiseDelay has elapsed, i.e the value is 0 |
| 170 | a.activeAlarms[idx].AlarmDefinition.RaiseDelay = 0 |
| 171 | app.Logger.Debug("Raise after delay alarmDef.RaiseDelay = %v, AlarmNotification = %v", alarmDef.RaiseDelay, *m) |
| 172 | a.mutex.Unlock() |
| 173 | } else { |
| 174 | app.Logger.Debug("Alarm deleted during raise delay. AlarmNotification = %v", *m) |
| 175 | a.mutex.Unlock() |
Mohamed Abukar | 2336a84 | 2020-10-30 16:19:38 +0200 | [diff] [blame] | 176 | return nil, nil |
Anssi Mannila | 18fd03c | 2020-10-29 10:01:00 +0200 | [diff] [blame] | 177 | } |
| 178 | } |
Mohamed Abukar | 2336a84 | 2020-10-30 16:19:38 +0200 | [diff] [blame] | 179 | |
Anssi Mannila | 18fd03c | 2020-10-29 10:01:00 +0200 | [diff] [blame] | 180 | m.AlarmDefinition.RaiseDelay = 0 |
| 181 | a.UpdateAlarmHistoryList(m) |
Mohamed Abukar | 2336a84 | 2020-10-30 16:19:38 +0200 | [diff] [blame] | 182 | a.WriteAlarmInfoToPersistentVolume() |
| 183 | |
| 184 | // Send alarm notification to NOMA, if enabled |
| 185 | if app.Config.GetBool("controls.noma.enabled") { |
| 186 | return a.PostAlarm(m) |
| 187 | } |
vipin | ba2ef5b | 2020-11-06 11:24:48 +0000 | [diff] [blame] | 188 | return a.PostAlert(a.GenerateAlertLabels(m.AlarmId, m.Alarm, AlertStatusActive, m.AlarmTime)) |
Anssi Mannila | 18fd03c | 2020-10-29 10:01:00 +0200 | [diff] [blame] | 189 | } |
| 190 | |
Mohamed Abukar | 2336a84 | 2020-10-30 16:19:38 +0200 | [diff] [blame] | 191 | func (a *AlarmManager) ProcessClearAlarm(m *AlarmNotification, alarmDef *alarm.AlarmDefinition, idx int) (*alert.PostAlertsOK, error) { |
Anssi Mannila | 18fd03c | 2020-10-29 10:01:00 +0200 | [diff] [blame] | 192 | app.Logger.Debug("Clear alarmDef.ClearDelay = %v, AlarmNotification = %v", alarmDef.ClearDelay, *m) |
| 193 | if alarmDef.ClearDelay > 0 { |
| 194 | a.mutex.Unlock() |
| 195 | timerDelay(alarmDef.ClearDelay) |
| 196 | app.Logger.Debug("Clear after delay alarmDef.ClearDelay = %v, AlarmNotification = %v", alarmDef.ClearDelay, *m) |
| 197 | a.mutex.Lock() |
| 198 | // Another alarm clear may have happened during delay and active alarms table index changed |
| 199 | var found bool |
| 200 | idx, found = a.IsMatchFound(m.Alarm) |
| 201 | if !found { |
Anssi Mannila | 18fd03c | 2020-10-29 10:01:00 +0200 | [diff] [blame] | 202 | a.mutex.Unlock() |
Mohamed Abukar | 2336a84 | 2020-10-30 16:19:38 +0200 | [diff] [blame] | 203 | return nil, nil |
Anssi Mannila | 18fd03c | 2020-10-29 10:01:00 +0200 | [diff] [blame] | 204 | } |
| 205 | } |
| 206 | a.UpdateAlarmFields(a.activeAlarms[idx].AlarmId, m) |
| 207 | a.alarmHistory = append(a.alarmHistory, *m) |
| 208 | a.activeAlarms = a.RemoveAlarm(a.activeAlarms, idx, "active") |
| 209 | if (len(a.alarmHistory) >= a.maxAlarmHistory) && (a.exceededAlarmHistoryOn == false) { |
| 210 | app.Logger.Warn("alarm history count exceeded maxAlarmHistory threshold") |
| 211 | a.GenerateThresholdAlarm(alarm.ALARM_HISTORY_EXCEED_MAX_THRESHOLD, "history") |
| 212 | } |
| 213 | |
| 214 | if a.exceededActiveAlarmOn && m.Alarm.SpecificProblem == alarm.ACTIVE_ALARM_EXCEED_MAX_THRESHOLD { |
| 215 | a.exceededActiveAlarmOn = false |
| 216 | } |
| 217 | |
| 218 | if a.exceededAlarmHistoryOn && m.Alarm.SpecificProblem == alarm.ALARM_HISTORY_EXCEED_MAX_THRESHOLD { |
| 219 | a.exceededAlarmHistoryOn = false |
| 220 | } |
Mohamed Abukar | 2336a84 | 2020-10-30 16:19:38 +0200 | [diff] [blame] | 221 | a.WriteAlarmInfoToPersistentVolume() |
| 222 | |
| 223 | a.mutex.Unlock() |
| 224 | if a.postClear && app.Config.GetBool("controls.noma.enabled") { |
| 225 | m.PerceivedSeverity = alarm.SeverityCleared |
| 226 | return a.PostAlarm(m) |
| 227 | } |
| 228 | return nil, nil |
Anssi Mannila | 18fd03c | 2020-10-29 10:01:00 +0200 | [diff] [blame] | 229 | } |
| 230 | |
// timerDelay blocks the calling goroutine for delay seconds. A zero or
// negative delay returns immediately.
func timerDelay(delay int) {
	time.Sleep(time.Duration(delay) * time.Second)
}
| 235 | |
Abukar Mohamed | 121e8b6 | 2020-09-18 11:41:33 +0000 | [diff] [blame] | 236 | func (a *AlarmManager) IsMatchFound(newAlarm alarm.Alarm) (int, bool) { |
Mohamed Abukar | 4e7e712 | 2020-03-04 10:01:45 +0200 | [diff] [blame] | 237 | for i, m := range a.activeAlarms { |
| 238 | if m.ManagedObjectId == newAlarm.ManagedObjectId && m.ApplicationId == newAlarm.ApplicationId && |
Mohamed Abukar | 0c38973 | 2020-09-17 14:47:50 +0300 | [diff] [blame] | 239 | m.SpecificProblem == newAlarm.SpecificProblem && m.IdentifyingInfo == newAlarm.IdentifyingInfo { |
Mohamed Abukar | 4e7e712 | 2020-03-04 10:01:45 +0200 | [diff] [blame] | 240 | return i, true |
| 241 | } |
| 242 | } |
| 243 | return -1, false |
| 244 | } |
| 245 | |
Mohamed Abukar | 105030f | 2020-10-22 18:08:34 +0300 | [diff] [blame] | 246 | func (a *AlarmManager) RemoveAlarm(alarms []AlarmNotification, i int, listName string) []AlarmNotification { |
Mohamed Abukar | 540ceee | 2020-09-09 08:07:40 +0300 | [diff] [blame] | 247 | app.Logger.Info("Alarm '%+v' deleted from the '%s' list", alarms[i], listName) |
Mohamed Abukar | 4e7e712 | 2020-03-04 10:01:45 +0200 | [diff] [blame] | 248 | copy(alarms[i:], alarms[i+1:]) |
| 249 | return alarms[:len(alarms)-1] |
| 250 | } |
| 251 | |
Mohamed Abukar | 105030f | 2020-10-22 18:08:34 +0300 | [diff] [blame] | 252 | func (a *AlarmManager) GenerateAlarmId() int { |
Mohamed Abukar | f5a8e71 | 2020-10-19 16:58:17 +0300 | [diff] [blame] | 253 | a.uniqueAlarmId++ // @todo: generate a unique ID |
Mohamed Abukar | 105030f | 2020-10-22 18:08:34 +0300 | [diff] [blame] | 254 | return a.uniqueAlarmId |
| 255 | } |
| 256 | |
| 257 | func (a *AlarmManager) UpdateAlarmFields(alarmId int, newAlarm *AlarmNotification) { |
| 258 | alarmDef := alarm.RICAlarmDefinitions[newAlarm.SpecificProblem] |
| 259 | newAlarm.AlarmId = alarmId |
Mohamed Abukar | f5a8e71 | 2020-10-19 16:58:17 +0300 | [diff] [blame] | 260 | newAlarm.AlarmText = alarmDef.AlarmText |
| 261 | newAlarm.EventType = alarmDef.EventType |
| 262 | } |
| 263 | |
Mohamed Abukar | 105030f | 2020-10-22 18:08:34 +0300 | [diff] [blame] | 264 | func (a *AlarmManager) GenerateThresholdAlarm(sp int, data string) bool { |
| 265 | thresholdAlarm := a.alarmClient.NewAlarm(sp, alarm.SeverityWarning, "threshold", data) |
| 266 | thresholdMessage := alarm.AlarmMessage{ |
| 267 | Alarm: thresholdAlarm, |
| 268 | AlarmAction: alarm.AlarmActionRaise, |
Mohamed Abukar | 3649fae | 2020-10-30 23:51:39 +0200 | [diff] [blame] | 269 | AlarmTime: time.Now().UnixNano(), |
Mohamed Abukar | 105030f | 2020-10-22 18:08:34 +0300 | [diff] [blame] | 270 | } |
vipin | 78b2b0a | 2020-10-28 10:10:18 +0000 | [diff] [blame] | 271 | alarmDef := alarm.RICAlarmDefinitions[sp] |
| 272 | alarmId := a.GenerateAlarmId() |
| 273 | alarmDef.AlarmId = alarmId |
| 274 | a.activeAlarms = append(a.activeAlarms, AlarmNotification{thresholdMessage, *alarmDef}) |
| 275 | a.alarmHistory = append(a.alarmHistory, AlarmNotification{thresholdMessage, *alarmDef}) |
Mohamed Abukar | 105030f | 2020-10-22 18:08:34 +0300 | [diff] [blame] | 276 | |
| 277 | return true |
| 278 | } |
| 279 | |
Anssi Mannila | 18fd03c | 2020-10-29 10:01:00 +0200 | [diff] [blame] | 280 | func (a *AlarmManager) UpdateActiveAlarmList(newAlarm *AlarmNotification) { |
vipin | 541eb50 | 2020-09-22 12:04:59 +0000 | [diff] [blame] | 281 | /* If maximum number of active alarms is reached, an error log writing is made, and new alarm indicating the problem is raised. |
Anssi Mannila | 18fd03c | 2020-10-29 10:01:00 +0200 | [diff] [blame] | 282 | The attempt to raise the alarm next time will be suppressed when found as duplicate. */ |
Mohamed Abukar | f5a8e71 | 2020-10-19 16:58:17 +0300 | [diff] [blame] | 283 | if (len(a.activeAlarms) >= a.maxActiveAlarms) && (a.exceededActiveAlarmOn == false) { |
Mohamed Abukar | 105030f | 2020-10-22 18:08:34 +0300 | [diff] [blame] | 284 | app.Logger.Warn("active alarm count exceeded maxActiveAlarms threshold") |
| 285 | a.exceededActiveAlarmOn = a.GenerateThresholdAlarm(alarm.ACTIVE_ALARM_EXCEED_MAX_THRESHOLD, "active") |
Mohamed Abukar | 540ceee | 2020-09-09 08:07:40 +0300 | [diff] [blame] | 286 | } |
| 287 | |
Anssi Mannila | 18fd03c | 2020-10-29 10:01:00 +0200 | [diff] [blame] | 288 | // @todo: For now just keep the active alarms in-memory. Use SDL later for persistence |
| 289 | a.activeAlarms = append(a.activeAlarms, *newAlarm) |
| 290 | } |
| 291 | |
| 292 | func (a *AlarmManager) UpdateAlarmHistoryList(newAlarm *AlarmNotification) { |
| 293 | /* If maximum number of events in alarm history is reached, an error log writing is made, |
| 294 | and new alarm indicating the problem is raised. The attempt to add new event time will |
| 295 | be suppressed */ |
| 296 | |
Mohamed Abukar | f5a8e71 | 2020-10-19 16:58:17 +0300 | [diff] [blame] | 297 | if (len(a.alarmHistory) >= a.maxAlarmHistory) && (a.exceededAlarmHistoryOn == false) { |
Mohamed Abukar | 105030f | 2020-10-22 18:08:34 +0300 | [diff] [blame] | 298 | app.Logger.Warn("alarm history count exceeded maxAlarmHistory threshold") |
| 299 | a.exceededAlarmHistoryOn = a.GenerateThresholdAlarm(alarm.ALARM_HISTORY_EXCEED_MAX_THRESHOLD, "history") |
Mohamed Abukar | 540ceee | 2020-09-09 08:07:40 +0300 | [diff] [blame] | 300 | } |
| 301 | |
Anssi Mannila | 18fd03c | 2020-10-29 10:01:00 +0200 | [diff] [blame] | 302 | // @todo: For now just keep the alarms history in-memory. Use SDL later for persistence |
Mohamed Abukar | 0c38973 | 2020-09-17 14:47:50 +0300 | [diff] [blame] | 303 | a.alarmHistory = append(a.alarmHistory, *newAlarm) |
Mohamed Abukar | 4e7e712 | 2020-03-04 10:01:45 +0200 | [diff] [blame] | 304 | } |
| 305 | |
Mohamed Abukar | 105030f | 2020-10-22 18:08:34 +0300 | [diff] [blame] | 306 | func (a *AlarmManager) PostAlarm(m *AlarmNotification) (*alert.PostAlertsOK, error) { |
| 307 | result, err := json.Marshal(m) |
| 308 | if err != nil { |
| 309 | app.Logger.Info("json.Marshal failed: %v", err) |
| 310 | return nil, err |
| 311 | } |
| 312 | |
| 313 | fullUrl := fmt.Sprintf("%s/%s", app.Config.GetString("controls.noma.host"), app.Config.GetString("controls.noma.alarmUrl")) |
| 314 | app.Logger.Info("Posting alarm to '%s'", fullUrl) |
| 315 | |
| 316 | resp, err := http.Post(fullUrl, "application/json", bytes.NewReader(result)) |
| 317 | if err != nil || resp == nil { |
| 318 | app.Logger.Info("Unable to post alarm to '%s': %v", fullUrl, err) |
| 319 | } |
| 320 | |
| 321 | return nil, err |
| 322 | } |
| 323 | |
vipin | ba2ef5b | 2020-11-06 11:24:48 +0000 | [diff] [blame] | 324 | func (a *AlarmManager) GenerateAlertLabels(alarmId int, newAlarm alarm.Alarm, status AlertStatus, alarmTime int64) (models.LabelSet, models.LabelSet) { |
Mohamed Abukar | af0c570 | 2020-03-11 10:29:40 +0200 | [diff] [blame] | 325 | alarmDef := alarm.RICAlarmDefinitions[newAlarm.SpecificProblem] |
Mohamed Abukar | 4e7e712 | 2020-03-04 10:01:45 +0200 | [diff] [blame] | 326 | amLabels := models.LabelSet{ |
Mohamed Abukar | b2f29a8 | 2020-03-17 09:31:55 +0200 | [diff] [blame] | 327 | "status": string(status), |
Mohamed Abukar | af0c570 | 2020-03-11 10:29:40 +0200 | [diff] [blame] | 328 | "alertname": alarmDef.AlarmText, |
Mohamed Abukar | 4e7e712 | 2020-03-04 10:01:45 +0200 | [diff] [blame] | 329 | "severity": string(newAlarm.PerceivedSeverity), |
Mohamed Abukar | 105030f | 2020-10-22 18:08:34 +0300 | [diff] [blame] | 330 | "service": fmt.Sprintf("%s/%s", newAlarm.ManagedObjectId, newAlarm.ApplicationId), |
| 331 | "system_name": "RIC", |
Mohamed Abukar | 4e7e712 | 2020-03-04 10:01:45 +0200 | [diff] [blame] | 332 | } |
| 333 | amAnnotations := models.LabelSet{ |
vipin | ba2ef5b | 2020-11-06 11:24:48 +0000 | [diff] [blame] | 334 | "alarm_id": fmt.Sprintf("%d", alarmId), |
Mohamed Abukar | 105030f | 2020-10-22 18:08:34 +0300 | [diff] [blame] | 335 | "specific_problem": fmt.Sprintf("%d", newAlarm.SpecificProblem), |
| 336 | "event_type": alarmDef.EventType, |
| 337 | "identifying_info": newAlarm.IdentifyingInfo, |
| 338 | "additional_info": newAlarm.AdditionalInfo, |
| 339 | "description": fmt.Sprintf("%s:%s", newAlarm.IdentifyingInfo, newAlarm.AdditionalInfo), |
| 340 | "instructions": alarmDef.OperationInstructions, |
| 341 | "timestamp": fmt.Sprintf("%s", time.Unix(0, alarmTime).Format("02/01/2006, 15:04:05")), |
Mohamed Abukar | 4e7e712 | 2020-03-04 10:01:45 +0200 | [diff] [blame] | 342 | } |
| 343 | |
| 344 | return amLabels, amAnnotations |
| 345 | } |
| 346 | |
Abukar Mohamed | 121e8b6 | 2020-09-18 11:41:33 +0000 | [diff] [blame] | 347 | func (a *AlarmManager) NewAlertmanagerClient() *client.Alertmanager { |
Mohamed Abukar | 4e7e712 | 2020-03-04 10:01:45 +0200 | [diff] [blame] | 348 | cr := clientruntime.New(a.amHost, a.amBaseUrl, a.amSchemes) |
| 349 | return client.New(cr, strfmt.Default) |
| 350 | } |
| 351 | |
Abukar Mohamed | 121e8b6 | 2020-09-18 11:41:33 +0000 | [diff] [blame] | 352 | func (a *AlarmManager) PostAlert(amLabels, amAnnotations models.LabelSet) (*alert.PostAlertsOK, error) { |
Mohamed Abukar | 4e7e712 | 2020-03-04 10:01:45 +0200 | [diff] [blame] | 353 | pa := &models.PostableAlert{ |
| 354 | Alert: models.Alert{ |
| 355 | GeneratorURL: strfmt.URI(""), |
| 356 | Labels: amLabels, |
| 357 | }, |
| 358 | Annotations: amAnnotations, |
| 359 | } |
| 360 | alertParams := alert.NewPostAlertsParams().WithAlerts(models.PostableAlerts{pa}) |
| 361 | |
Mohamed Abukar | 643241f | 2020-06-09 15:26:00 +0300 | [diff] [blame] | 362 | app.Logger.Info("Posting alerts: labels: %+v, annotations: %+v", amLabels, amAnnotations) |
Mohamed Abukar | af0c570 | 2020-03-11 10:29:40 +0200 | [diff] [blame] | 363 | ok, err := a.NewAlertmanagerClient().Alert.PostAlerts(alertParams) |
| 364 | if err != nil { |
| 365 | app.Logger.Error("Posting alerts to '%s/%s' failed with error: %v", a.amHost, a.amBaseUrl, err) |
| 366 | } |
| 367 | return ok, err |
Mohamed Abukar | 4e7e712 | 2020-03-04 10:01:45 +0200 | [diff] [blame] | 368 | } |
| 369 | |
Abukar Mohamed | 121e8b6 | 2020-09-18 11:41:33 +0000 | [diff] [blame] | 370 | func (a *AlarmManager) StatusCB() bool { |
Mohamed Abukar | 4e7e712 | 2020-03-04 10:01:45 +0200 | [diff] [blame] | 371 | if !a.rmrReady { |
| 372 | app.Logger.Info("RMR not ready yet!") |
| 373 | } |
| 374 | |
| 375 | return a.rmrReady |
Mohamed Abukar | 3e03815 | 2020-03-04 10:01:45 +0200 | [diff] [blame] | 376 | } |
Mohamed Abukar | 540ceee | 2020-09-09 08:07:40 +0300 | [diff] [blame] | 377 | |
vipin | 541eb50 | 2020-09-22 12:04:59 +0000 | [diff] [blame] | 378 | func (a *AlarmManager) ConfigChangeCB(configparam string) { |
| 379 | |
| 380 | a.maxActiveAlarms = app.Config.GetInt("controls.maxActiveAlarms") |
Anssi Mannila | ac56b89 | 2020-11-20 14:50:00 +0200 | [diff] [blame] | 381 | if a.maxActiveAlarms == 0 { |
| 382 | a.maxActiveAlarms = 5000 |
| 383 | } |
| 384 | |
vipin | 541eb50 | 2020-09-22 12:04:59 +0000 | [diff] [blame] | 385 | a.maxAlarmHistory = app.Config.GetInt("controls.maxAlarmHistory") |
Anssi Mannila | ac56b89 | 2020-11-20 14:50:00 +0200 | [diff] [blame] | 386 | if a.maxAlarmHistory == 0 { |
| 387 | a.maxAlarmHistory = 20000 |
| 388 | } |
| 389 | |
Anssi Mannila | 4450a89 | 2020-09-25 10:24:29 +0300 | [diff] [blame] | 390 | a.alertInterval = viper.GetInt("controls.promAlertManager.alertInterval") |
| 391 | a.amHost = viper.GetString("controls.promAlertManager.address") |
vipin | 541eb50 | 2020-09-22 12:04:59 +0000 | [diff] [blame] | 392 | |
| 393 | app.Logger.Debug("ConfigChangeCB: maxActiveAlarms %v", a.maxActiveAlarms) |
| 394 | app.Logger.Debug("ConfigChangeCB: maxAlarmHistory = %v", a.maxAlarmHistory) |
Anssi Mannila | 4450a89 | 2020-09-25 10:24:29 +0300 | [diff] [blame] | 395 | app.Logger.Debug("ConfigChangeCB: alertInterval %v", a.alertInterval) |
| 396 | app.Logger.Debug("ConfigChangeCB: amHost = %v", a.amHost) |
vipin | 541eb50 | 2020-09-22 12:04:59 +0000 | [diff] [blame] | 397 | |
| 398 | return |
| 399 | } |
| 400 | |
vipin | 14323a9 | 2020-09-25 10:03:43 +0000 | [diff] [blame] | 401 | func (a *AlarmManager) ReadAlarmDefinitionFromJson() { |
| 402 | |
| 403 | filename := os.Getenv("DEF_FILE") |
| 404 | file, err := ioutil.ReadFile(filename) |
| 405 | if err == nil { |
| 406 | data := RicAlarmDefinitions{} |
| 407 | err = json.Unmarshal([]byte(file), &data) |
| 408 | if err == nil { |
| 409 | for _, alarmDefinition := range data.AlarmDefinitions { |
| 410 | _, exists := alarm.RICAlarmDefinitions[alarmDefinition.AlarmId] |
| 411 | if exists { |
| 412 | app.Logger.Error("ReadAlarmDefinitionFromJson: alarm definition already exists for %v", alarmDefinition.AlarmId) |
| 413 | } else { |
| 414 | app.Logger.Debug("ReadAlarmDefinitionFromJson: alarm %v", alarmDefinition.AlarmId) |
| 415 | ricAlarmDefintion := new(alarm.AlarmDefinition) |
| 416 | ricAlarmDefintion.AlarmId = alarmDefinition.AlarmId |
| 417 | ricAlarmDefintion.AlarmText = alarmDefinition.AlarmText |
| 418 | ricAlarmDefintion.EventType = alarmDefinition.EventType |
| 419 | ricAlarmDefintion.OperationInstructions = alarmDefinition.OperationInstructions |
Anssi Mannila | 18fd03c | 2020-10-29 10:01:00 +0200 | [diff] [blame] | 420 | ricAlarmDefintion.RaiseDelay = alarmDefinition.RaiseDelay |
| 421 | ricAlarmDefintion.ClearDelay = alarmDefinition.ClearDelay |
Mohamed Abukar | 3649fae | 2020-10-30 23:51:39 +0200 | [diff] [blame] | 422 | ricAlarmDefintion.TimeToLive = alarmDefinition.TimeToLive |
vipin | 14323a9 | 2020-09-25 10:03:43 +0000 | [diff] [blame] | 423 | alarm.RICAlarmDefinitions[alarmDefinition.AlarmId] = ricAlarmDefintion |
| 424 | } |
| 425 | } |
| 426 | } else { |
vipin | 6f73fa3 | 2020-10-06 06:51:53 +0000 | [diff] [blame] | 427 | app.Logger.Error("ReadAlarmDefinitionFromJson: json.Unmarshal failed with error %v", err) |
vipin | 14323a9 | 2020-09-25 10:03:43 +0000 | [diff] [blame] | 428 | } |
| 429 | } else { |
vipin | 6f73fa3 | 2020-10-06 06:51:53 +0000 | [diff] [blame] | 430 | app.Logger.Error("ReadAlarmDefinitionFromJson: ioutil.ReadFile failed with error %v", err) |
vipin | 14323a9 | 2020-09-25 10:03:43 +0000 | [diff] [blame] | 431 | } |
| 432 | } |
| 433 | |
vipin | 78b2b0a | 2020-10-28 10:10:18 +0000 | [diff] [blame] | 434 | func (a *AlarmManager) ReadAlarmInfoFromPersistentVolume() { |
| 435 | var alarmpersistentinfo AlarmPersistentInfo |
| 436 | byteValue, rerr := ioutil.ReadFile(a.alarmInfoPvFile) |
| 437 | if rerr != nil { |
| 438 | app.Logger.Error("ararminfo.json file read error %v", rerr) |
| 439 | } else { |
| 440 | err := json.Unmarshal(byteValue, &alarmpersistentinfo) |
| 441 | if err != nil { |
| 442 | app.Logger.Error("alarmpersistentinfo json unmarshal error %v", err) |
| 443 | } else { |
| 444 | a.uniqueAlarmId = alarmpersistentinfo.UniqueAlarmId |
| 445 | a.activeAlarms = make([]AlarmNotification, len(alarmpersistentinfo.ActiveAlarms)) |
| 446 | a.alarmHistory = make([]AlarmNotification, len(alarmpersistentinfo.AlarmHistory)) |
| 447 | copy(a.activeAlarms, alarmpersistentinfo.ActiveAlarms) |
| 448 | copy(a.alarmHistory, alarmpersistentinfo.AlarmHistory) |
| 449 | } |
| 450 | } |
| 451 | } |
| 452 | |
| 453 | func (a *AlarmManager) WriteAlarmInfoToPersistentVolume() { |
| 454 | var alarmpersistentinfo AlarmPersistentInfo |
| 455 | alarmpersistentinfo.UniqueAlarmId = a.uniqueAlarmId |
| 456 | alarmpersistentinfo.ActiveAlarms = make([]AlarmNotification, len(a.activeAlarms)) |
| 457 | alarmpersistentinfo.AlarmHistory = make([]AlarmNotification, len(a.alarmHistory)) |
Mohamed Abukar | 3649fae | 2020-10-30 23:51:39 +0200 | [diff] [blame] | 458 | |
vipin | 78b2b0a | 2020-10-28 10:10:18 +0000 | [diff] [blame] | 459 | copy(alarmpersistentinfo.ActiveAlarms, a.activeAlarms) |
| 460 | copy(alarmpersistentinfo.AlarmHistory, a.alarmHistory) |
Mohamed Abukar | 3649fae | 2020-10-30 23:51:39 +0200 | [diff] [blame] | 461 | |
vipin | 78b2b0a | 2020-10-28 10:10:18 +0000 | [diff] [blame] | 462 | wdata, err := json.MarshalIndent(alarmpersistentinfo, "", " ") |
| 463 | if err != nil { |
| 464 | app.Logger.Error("alarmpersistentinfo json marshal error %v", err) |
| 465 | } else { |
| 466 | werr := ioutil.WriteFile(a.alarmInfoPvFile, wdata, 0777) |
| 467 | if werr != nil { |
| 468 | app.Logger.Error("alarminfo.json file write error %v", werr) |
| 469 | } |
| 470 | } |
| 471 | } |
| 472 | |
Mohamed Abukar | 3649fae | 2020-10-30 23:51:39 +0200 | [diff] [blame] | 473 | func (a *AlarmManager) Run(sdlcheck bool, ttlInterval int) { |
Abukar Mohamed | 121e8b6 | 2020-09-18 11:41:33 +0000 | [diff] [blame] | 474 | app.Logger.SetMdc("alarmManager", fmt.Sprintf("%s:%s", Version, Hash)) |
Mohamed Abukar | 540ceee | 2020-09-09 08:07:40 +0300 | [diff] [blame] | 475 | app.SetReadyCB(func(d interface{}) { a.rmrReady = true }, true) |
| 476 | app.Resource.InjectStatusCb(a.StatusCB) |
vipin | 541eb50 | 2020-09-22 12:04:59 +0000 | [diff] [blame] | 477 | app.AddConfigChangeListener(a.ConfigChangeCB) |
Mohamed Abukar | 540ceee | 2020-09-09 08:07:40 +0300 | [diff] [blame] | 478 | |
vipin | 54a3a4f | 2020-09-23 12:19:58 +0000 | [diff] [blame] | 479 | alarm.RICAlarmDefinitions = make(map[int]*alarm.AlarmDefinition) |
vipin | 14323a9 | 2020-09-25 10:03:43 +0000 | [diff] [blame] | 480 | a.ReadAlarmDefinitionFromJson() |
vipin | 54a3a4f | 2020-09-23 12:19:58 +0000 | [diff] [blame] | 481 | |
Mohamed Abukar | 3649fae | 2020-10-30 23:51:39 +0200 | [diff] [blame] | 482 | a.InjectRoutes() |
Mohamed Abukar | 540ceee | 2020-09-09 08:07:40 +0300 | [diff] [blame] | 483 | |
| 484 | // Start background timer for re-raising alerts |
Mohamed Abukar | 540ceee | 2020-09-09 08:07:40 +0300 | [diff] [blame] | 485 | go a.StartAlertTimer() |
Mohamed Abukar | 3649fae | 2020-10-30 23:51:39 +0200 | [diff] [blame] | 486 | go a.StartTTLTimer(ttlInterval) |
| 487 | |
vipin | 541eb50 | 2020-09-22 12:04:59 +0000 | [diff] [blame] | 488 | a.alarmClient, _ = alarm.InitAlarm("SEP", "ALARMMANAGER") |
Mohamed Abukar | 540ceee | 2020-09-09 08:07:40 +0300 | [diff] [blame] | 489 | |
vipin | 78b2b0a | 2020-10-28 10:10:18 +0000 | [diff] [blame] | 490 | a.ReadAlarmInfoFromPersistentVolume() |
| 491 | |
Mohamed Abukar | 540ceee | 2020-09-09 08:07:40 +0300 | [diff] [blame] | 492 | app.RunWithParams(a, sdlcheck) |
| 493 | } |
| 494 | |
Mohamed Abukar | 105030f | 2020-10-22 18:08:34 +0300 | [diff] [blame] | 495 | func NewAlarmManager(amHost string, alertInterval int, clearAlarm bool) *AlarmManager { |
Mohamed Abukar | 540ceee | 2020-09-09 08:07:40 +0300 | [diff] [blame] | 496 | if alertInterval == 0 { |
| 497 | alertInterval = viper.GetInt("controls.promAlertManager.alertInterval") |
| 498 | } |
| 499 | |
| 500 | if amHost == "" { |
| 501 | amHost = viper.GetString("controls.promAlertManager.address") |
| 502 | } |
| 503 | |
Anssi Mannila | ac56b89 | 2020-11-20 14:50:00 +0200 | [diff] [blame] | 504 | maxActiveAlarms := app.Config.GetInt("controls.maxActiveAlarms") |
| 505 | if maxActiveAlarms == 0 { |
| 506 | maxActiveAlarms = 5000 |
| 507 | } |
| 508 | |
| 509 | maxAlarmHistory := app.Config.GetInt("controls.maxAlarmHistory") |
| 510 | if maxAlarmHistory == 0 { |
| 511 | maxAlarmHistory = 20000 |
| 512 | } |
| 513 | |
Abukar Mohamed | 121e8b6 | 2020-09-18 11:41:33 +0000 | [diff] [blame] | 514 | return &AlarmManager{ |
Mohamed Abukar | f5a8e71 | 2020-10-19 16:58:17 +0300 | [diff] [blame] | 515 | rmrReady: false, |
Mohamed Abukar | 105030f | 2020-10-22 18:08:34 +0300 | [diff] [blame] | 516 | postClear: clearAlarm, |
Mohamed Abukar | f5a8e71 | 2020-10-19 16:58:17 +0300 | [diff] [blame] | 517 | amHost: amHost, |
Mohamed Abukar | 105030f | 2020-10-22 18:08:34 +0300 | [diff] [blame] | 518 | amBaseUrl: app.Config.GetString("controls.promAlertManager.baseUrl"), |
| 519 | amSchemes: []string{app.Config.GetString("controls.promAlertManager.schemes")}, |
Mohamed Abukar | f5a8e71 | 2020-10-19 16:58:17 +0300 | [diff] [blame] | 520 | alertInterval: alertInterval, |
Mohamed Abukar | 105030f | 2020-10-22 18:08:34 +0300 | [diff] [blame] | 521 | activeAlarms: make([]AlarmNotification, 0), |
| 522 | alarmHistory: make([]AlarmNotification, 0), |
| 523 | uniqueAlarmId: 0, |
Anssi Mannila | ac56b89 | 2020-11-20 14:50:00 +0200 | [diff] [blame] | 524 | maxActiveAlarms: maxActiveAlarms, |
| 525 | maxAlarmHistory: maxAlarmHistory, |
vipin | 6f73fa3 | 2020-10-06 06:51:53 +0000 | [diff] [blame] | 526 | exceededActiveAlarmOn: false, |
| 527 | exceededAlarmHistoryOn: false, |
vipin | 78b2b0a | 2020-10-28 10:10:18 +0000 | [diff] [blame] | 528 | alarmInfoPvFile: app.Config.GetString("controls.alarmInfoPvFile"), |
Mohamed Abukar | 540ceee | 2020-09-09 08:07:40 +0300 | [diff] [blame] | 529 | } |
| 530 | } |
| 531 | |
| 532 | // Main function |
| 533 | func main() { |
Mohamed Abukar | 3649fae | 2020-10-30 23:51:39 +0200 | [diff] [blame] | 534 | NewAlarmManager("", 0, true).Run(true, 10) |
Mohamed Abukar | 540ceee | 2020-09-09 08:07:40 +0300 | [diff] [blame] | 535 | } |