<?xml version="1.0" encoding="UTF-8"?>
<!-- generator="FeedCreator 1.8" -->
<?xml-stylesheet href="http://3.86.49.49/lib/exe/css.php?s=feed" type="text/css"?>
<rdf:RDF
    xmlns="http://purl.org/rss/1.0/"
    xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
    xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
    xmlns:dc="http://purl.org/dc/elements/1.1/">
    <channel rdf:about="http://3.86.49.49/feed.php">
        <title>Internal Errigal Collaboration Wiki resolution_area:prometheus_resolutions</title>
        <description></description>
        <link>http://3.86.49.49/</link>
        <image rdf:resource="http://3.86.49.49/lib/tpl/docnavwiki/images/favicon.ico" />
       <dc:date>2026-04-17T17:50:54+00:00</dc:date>
        <items>
            <rdf:Seq>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1100&amp;rev=1624612196&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1101&amp;rev=1625480131&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1102&amp;rev=1639743457&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1103&amp;rev=1625480755&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1104&amp;rev=1625486183&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1105&amp;rev=1640272754&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1106&amp;rev=1625568627&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1107&amp;rev=1624612196&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1108&amp;rev=1720512891&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1109&amp;rev=1639746597&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1110&amp;rev=1625481084&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1111&amp;rev=1640269261&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1112&amp;rev=1624612196&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1113&amp;rev=1639054896&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1114&amp;rev=1627657471&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1115&amp;rev=1665393402&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1116&amp;rev=1640016373&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1117&amp;rev=1640014226&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1118&amp;rev=1640020669&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1201&amp;rev=1625486213&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1202&amp;rev=1625484774&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1203&amp;rev=1625481076&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1204&amp;rev=1640096350&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1301&amp;rev=1640083700&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1302&amp;rev=1625481747&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1401&amp;rev=1671449602&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1402&amp;rev=1640085000&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1403&amp;rev=1625488615&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1404&amp;rev=1640084942&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1405&amp;rev=1625572530&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1406&amp;rev=1639736850&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1407&amp;rev=1664538774&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1408&amp;rev=1716286322&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1501&amp;rev=1629495238&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1502&amp;rev=1625482559&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1503&amp;rev=1640084828&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1504&amp;rev=1625494582&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1505&amp;rev=1624612196&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1601&amp;rev=1625572026&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1602&amp;rev=1640267692&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1603&amp;rev=1625483114&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1701&amp;rev=1625483911&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1702&amp;rev=1624612196&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1703&amp;rev=1625497332&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1704&amp;rev=1625512082&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1705&amp;rev=1625482265&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1706&amp;rev=1639674096&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1707&amp;rev=1625483548&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1708&amp;rev=1625484458&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1801&amp;rev=1624612196&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1802&amp;rev=1624612196&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1803&amp;rev=1624612196&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1804&amp;rev=1625482764&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1805&amp;rev=1640268218&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1806&amp;rev=1697616008&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1807&amp;rev=1625495417&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1808&amp;rev=1634303426&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1809&amp;rev=1651588126&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1901&amp;rev=1625512687&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p2000&amp;rev=1655812430&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9109&amp;rev=1632230332&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9110&amp;rev=1627657609&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9111&amp;rev=1627657659&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9112&amp;rev=1627657675&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9113&amp;rev=1631021988&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9114&amp;rev=1693844014&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9115&amp;rev=1649331217&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9116&amp;rev=1649331037&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9117&amp;rev=1652187299&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9118&amp;rev=1671811342&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9119&amp;rev=1668611113&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9120&amp;rev=1669028116&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9121&amp;rev=1671116386&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9124&amp;rev=1707292769&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9125&amp;rev=1698756573&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9126&amp;rev=1685114685&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9127&amp;rev=1682585954&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9128&amp;rev=1699616416&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9129&amp;rev=1723138988&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9130&amp;rev=1726735246&amp;do=diff"/>
                <rdf:li rdf:resource="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9131&amp;rev=1732805578&amp;do=diff"/>
            </rdf:Seq>
        </items>
    </channel>
    <image rdf:about="http://3.86.49.49/lib/tpl/docnavwiki/images/favicon.ico">
        <title>Internal Errigal Collaboration Wiki</title>
        <link>http://3.86.49.49/</link>
        <url>http://3.86.49.49/lib/tpl/docnavwiki/images/favicon.ico</url>
    </image>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1100&amp;rev=1624612196&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-06-25T10:09:56+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1100</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1100&amp;rev=1624612196&amp;do=diff</link>
        <description>Alert

Level:

Purpose:

Scenario:

Resolution:

Manual Action Steps:

Auto Clear:</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1101&amp;rev=1625480131&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-07-05T11:15:31+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1101</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1101&amp;rev=1625480131&amp;do=diff</link>
        <description>AppDown

Level:   Critical     FIXME

Purpose: To monitor application status and alert if it is not responsive

Scenario: &lt;application&gt; on &lt;server&gt; has been down for more than 120s.

Resolution: Restart the application

Manual Action Steps: If the application is one of the grails apps then use the start/stop.sh scripts in the</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1102&amp;rev=1639743457&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-12-17T12:17:37+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1102</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1102&amp;rev=1639743457&amp;do=diff</link>
        <description>TicketerEmailFailedDelivery

Level: Critical FIXME

Purpose:
If an email that was supposed to be sent by the application could not be sent, it will end up in the dead email queue. Support must try to send the email again by taking it from the dead email queue, checking it for errors and then resending it in the normal email queue with persistent set in the dropdown.
See Manual Action Steps</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1103&amp;rev=1625480755&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-07-05T11:25:55+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1103</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1103&amp;rev=1625480755&amp;do=diff</link>
        <description>RemoteTicketFailedToCreate

Level: Critical FIXME

Purpose:
Reports if a remote ticket creation has failed on RabbitMQ. If the ticket fails to create, it will be found in the remote.ticket.dead.letter.queue on RMQ. 

Scenario: A remote ticket has failed to create. This can happen for a number of reasons. The exception can be found when getting the message from the dead letter queue. Steps can be found in the below link. The most common reason is that the ticket is being created on an orphan elem…</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1104&amp;rev=1625486183&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-07-05T12:56:23+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1104</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1104&amp;rev=1625486183&amp;do=diff</link>
        <description>AlarmCacheQueueIncreasing

Level: Critical

Purpose: Identify when the RabbitMQ alarm_cache_inbound_queue is increasing.

Scenario: There have been &gt;= 5000 messages in the alarm cache queue for 120s. This means the RabbitMQ is not processing these messages fast enough.</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1105&amp;rev=1640272754&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-12-23T15:19:14+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1105</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1105&amp;rev=1640272754&amp;do=diff</link>
        <description>BlackboxExporterProbeFail

Level: minor 

Purpose: Alerts operations that the blackbox exporter probe has failed.

Scenario: The blackbox exporter probe used for detecting SSL cert expiry has failed for 10 minutes.

Resolution:
Check if service is running</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1106&amp;rev=1625568627&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-07-06T11:50:27+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1106</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1106&amp;rev=1625568627&amp;do=diff</link>
        <description>StompQueueNotDeleted

Level: Critical

Purpose:
The EMS will create a STOMP queue in RabbitMQ which handles the updates for the EMS Frontend GUI. The queue should live for as long as the EMS tab exists. 

This queue binds to the &#039;ems_push_notification_topic&#039; exchange with the routing key &#039;AlarmCacheUpdate&#039; so anything pushed onto this exchange with that binding key the frontend will obtain. The frontend takes the Alarm Cache Update Message and converts it to an AlarmCacheDTO (GWT JSON to Java Ob…</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1107&amp;rev=1624612196&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-06-25T10:09:56+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1107</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1107&amp;rev=1624612196&amp;do=diff</link>
        <description>RebootRequired

Level: Warning :?:

Purpose:

Scenario: &lt;server&gt; requires a reboot.

Resolution:

Manual Action Steps:

Auto Clear:</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1108&amp;rev=1720512891&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2024-07-09T09:14:51+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1108</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1108&amp;rev=1720512891&amp;do=diff</link>
        <description>ClockSkewDetected

Level: Warning :?:

Purpose: To let us know if there is a difference between the machine clock and the system clock

Scenario: Clock skew detected on &lt;server&gt; for 120s. Ensure NTP is configured correctly on this host.

Resolution:</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1109&amp;rev=1639746597&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-12-17T13:09:57+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1109</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1109&amp;rev=1639746597&amp;do=diff</link>
        <description>SystemdServiceCrashed

Level: Warning

Purpose:
To ensure the services that are supposed to be running stay running

Scenario: SystemD service has crashed for 5m.

Resolution:
The service is running in a stable state

Manual Action Steps:

	*  ssh onto the affected server.</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1110&amp;rev=1625481084&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-07-05T11:31:24+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1110</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1110&amp;rev=1625481084&amp;do=diff</link>
        <description>PhysicalComponentTooHot

Level: Warning :!:

Purpose:
Reports on the physical hardware temperature on the servers.

Scenario: A physical component is operating outside of a safe temperature (&gt; 75 degrees celsius) for 5m.

Resolution:
Monitor the server and alerts to see if there are any processes running that shouldn&#039;t be running or are taking too long. Check RAM usage.</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1111&amp;rev=1640269261&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-12-23T14:21:01+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1111</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1111&amp;rev=1640269261&amp;do=diff</link>
        <description>Watchdog (#)

Level: Critical :?:

Purpose: Alerts operations that the watchdog agent is not running on a server; this can lead to support missing critical alerts.

Scenario:
When a Watchdog cannot start it will generate the Watchdog Agent alert.
There is an active alarm on the Watchdog Agent.</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1112&amp;rev=1624612196&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-06-25T10:09:56+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1112</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1112&amp;rev=1624612196&amp;do=diff</link>
        <description>NodeOvertemperatureAlarm

Level: Critical FIXME

Purpose:

Scenario: A node has been too hot for 5m and has triggered a temperature alarm.

Resolution:

Manual Action Steps:

Auto Clear:</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1113&amp;rev=1639054896&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-12-09T13:01:36+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1113</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1113&amp;rev=1639054896&amp;do=diff</link>
        <description>AppDown

Level:

	*  KlaDigiModemNon200_30min - Warning 
		*  KlaDigiModemNon200_60min - Minor

Purpose: Alert that the KLA Digi Modem which is our gateway into KLA Site is unavailable

Scenario: DigiModem has gone off line on the site listed. This could be scheduled or unscheduled maintenance</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1114&amp;rev=1627657471&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-07-30T16:04:31+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1114</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1114&amp;rev=1627657471&amp;do=diff</link>
        <description>NewScottyProP1Ticket

Level: critical

Purpose: Notify operations that a P1 ticket has been received from a customer in ScottyPro.

Scenario: Alerts when there is a P1 ticket from any customer in Triage. Query checks for P1 tickets in Triage that were created in the previous 12 hours.</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1115&amp;rev=1665393402&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2022-10-10T10:16:42+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1115</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1115&amp;rev=1665393402&amp;do=diff</link>
        <description>KlaDigiModemSitesDown

Level:
Critical

Purpose: Alert that over 90% of all the KLA Digi Modems, which are our gateway into KLA Site, are unavailable. This is the count for the number of Digi Modems that are offline.

Scenario: 90% of DigiModems have gone off line.</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1116&amp;rev=1640016373&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-12-20T16:06:13+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1116</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1116&amp;rev=1640016373&amp;do=diff</link>
        <description>AppDown: CAS Out Of Memory

Level:

	*  CASOutOfMemory - Critical 

Purpose: When CAS reports in the logs that it is running out of memory or has already run out. 

Scenario: The CAS logs printed: 

java.lang.OutOfMemoryError: unable to create new native thread</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1117&amp;rev=1640014226&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-12-20T15:30:26+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1117</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1117&amp;rev=1640014226&amp;do=diff</link>
        <description>AppDown: Node Monitor

Level:

	*  NodeMonitorAccidentallyInActive - Critical 

Purpose: When we perform an application deployment for a new version but we have accidentally disabled the Node Monitor for a customer that is still using the Node Monitor.</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1118&amp;rev=1640020669&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-12-20T17:17:49+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1118</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1118&amp;rev=1640020669&amp;do=diff</link>
        <description>Sub Case Reason N/A

Level:

	*   - Major 

Purpose: Here and there, there are going to be traps that don&#039;t have a reason value assigned for the Ticketer Integration. This checks the logs for that because ATC get annoyed by it and may send a ticket. 

Scenario:</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1201&amp;rev=1625486213&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-07-05T12:56:53+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1201</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1201&amp;rev=1625486213&amp;do=diff</link>
        <description>HeapLow

Level: Critical

Purpose:
This alert was created to identify applications where the allocated memory or heap of the JVM was running low. This can happen for a number of reasons.
There may be: 
- blocking event.
- an excessive amount of data being generated by or pushed to the application.
- RAM running low on the server.
- A script that is holding a lot of data in memory.</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1202&amp;rev=1625484774&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-07-05T12:32:54+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1202</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1202&amp;rev=1625484774&amp;do=diff</link>
        <description>HeapLowGrails

Level: Critical FIXME

Purpose: Identify when one of the Errigal Grails Applications is using an excessive amount of heap memory for a defined duration of time.

Scenario: ReportingManager on EXTApps2 used more than 90% of its Heap for 120s. Typical reason in this instance would be a rogue report run with a very broad input variable e.g. a trap report specifying a range of over a year.</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1203&amp;rev=1625481076&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-07-05T11:31:16+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1203</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1203&amp;rev=1625481076&amp;do=diff</link>
        <description>RAMLow

Level: Major :!:

Purpose: Alerts us when an application is using more RAM than expected

Scenario: Free RAM on &lt;application&gt; has been less than 10% for 10m.

Resolution: Review how much RAM is used by the application and whether more is required.</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1204&amp;rev=1640096350&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-12-21T14:19:10+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1204</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1204&amp;rev=1640096350&amp;do=diff</link>
        <description>HeapLowSpringBoot

Level: Critical FIXME

Purpose: Identify when one of the Errigal Spring Boot Applications is using an excessive amount of heap memory for a defined duration of time.

Scenario: Userprofile is reporting high heap. This could be caused by a number of things, including excessive user activity/updates.</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1301&amp;rev=1640083700&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-12-21T10:48:20+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1301</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1301&amp;rev=1640083700&amp;do=diff</link>
        <description>CpuHigh

Level: Major :!:

Purpose:

Scenario: CPU on &lt;server&gt; has been over 70% for 5m.

Resolution:

Manual Action Steps:
If this has been running for a while.

SSH onto the server ssh scotty@server.err

Use top

Just ensure that it&#039;s nothing suspicious.</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1302&amp;rev=1625481747&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-07-05T11:42:27+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1302</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1302&amp;rev=1625481747&amp;do=diff</link>
        <description>CriticalCPULoad

Level: Critical FIXME

Purpose:
The alert reports if the CPU usage is above 96% for more than 2 minutes on one of the servers.

Scenario:
The CPU has been over 96% for more than 2 minutes.

Resolution:
Monitor the server and alerts. Check the RAM usage on the server. Check processes to see if anything is running that shouldn&#039;t be. Check the Grafana metrics for CPU percentage for the last few days for any patterns. This could be caused by a process that runs at the same time ever…</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1401&amp;rev=1671449602&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2022-12-19T11:33:22+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1401</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1401&amp;rev=1671449602&amp;do=diff</link>
        <description>DiskSpaceLow

Level: Major :!:

Purpose: Identify when there is only 20% or 10% of disk space left

Scenario: Major - Disk usage on &lt;server&gt; has been more than 90% for 5m.  
Minor - Disk usage on &lt;server&gt; has been more than 80% for 5m.

Resolution: Check free space on server: “df -h” &amp; remove any large temporary files</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1402&amp;rev=1640085000&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-12-21T11:10:00+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1402</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1402&amp;rev=1640085000&amp;do=diff</link>
        <description>CriticalDiskSpace

Level: Critical FIXME

Purpose: To ensure the server doesn&#039;t crash due to a filled disk.

Scenario: &lt;server&gt; of job &lt;application&gt; has had less than 10% space remaining for 4m.

Resolution: More than 10% of the disk is free.

Manual Action Steps:</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1403&amp;rev=1625488615&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-07-05T13:36:55+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1403</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1403&amp;rev=1625488615&amp;do=diff</link>
        <description>DiskWillFillIn4Hours

Level: Warning

Purpose:
This is a common alert which notifies operations if a server is writing to the disk too much.

Scenario: Disk will fill in 4 hours based on current write rate in past 5m. 

Resolution:
Very commonly this alert will clear itself in a few moments but it is important to be wary of it as a filled disk can be very difficult to recover from.</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1404&amp;rev=1640084942&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-12-21T11:09:02+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1404</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1404&amp;rev=1640084942&amp;do=diff</link>
        <description>UnusualDiskReadLatency

Level: Warning

Purpose:

Scenario: Disk read operations taking &gt;100ms for 5m.

Resolution: You could check if there is a process doing large reads or writes and check the status of it. Check the health of the disk, if it&#039;s damaged then it may need replacement. Also check storage, the disk may be filling up.</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1405&amp;rev=1625572530&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-07-06T12:55:30+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1405</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1405&amp;rev=1625572530&amp;do=diff</link>
        <description>UnusualDiskWriteLatency

Level: Warning

Purpose: This alert tells us that the disk write latency is growing.

Scenario: Disk write operations taking &gt;100ms for 5m.

Resolution: You could check if there is a process doing large writes and check the status of it. Check the health of the disk, if it&#039;s damaged then it may need replacement. Also check storage, the disk may be filling up.</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1406&amp;rev=1639736850&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-12-17T10:27:30+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1406</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1406&amp;rev=1639736850&amp;do=diff</link>
        <description>SwapIsFillingUp

Level: Warning

Purpose:
This alert is generally one that will alarm and clear, but it is important to be aware of high swap use. If the swap fills up the kernel will start to kill processes and that will lead to app downs or a complete shutdown.</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1407&amp;rev=1664538774&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2022-09-30T12:52:54+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1407</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1407&amp;rev=1664538774&amp;do=diff</link>
        <description>LogFileSizeIncreasing

Level: Major/Critical

Purpose: Notify operations that a log file on the server has a rapidly growing log line count

Scenario: An error has occurred in an application and is appending the log at a rapid rate, filling up the disk space.</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1408&amp;rev=1716286322&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2024-05-21T11:12:02+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1408</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1408&amp;rev=1716286322&amp;do=diff</link>
        <description>KLA NUC DF 60/80

Level: Major/Critical

Purpose: Notify operations that a KLA NUC has disk usage greater than 60 or 80 percent

Resolution: Take the entry point ID in the alert, and search for this ID in the DB.

select * from snmp_manager.network_element where id = &lt;ID&gt;</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1501&amp;rev=1629495238&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-08-20T22:33:58+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1501</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1501&amp;rev=1629495238&amp;do=diff</link>
        <description>SSLCertExpiring

Level: minor 

Purpose:
The alert checks the SSL cert for a URL and reports if the cert will expire within 90 days.

Scenario: The SSL Certificate for the website will expire in 90 days.

Resolution:
Renew the SSL. All our websites that end in *.errigal.com use a GoDaddy wildcard cert. Check</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1502&amp;rev=1625482559&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-07-05T11:55:59+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1502</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1502&amp;rev=1625482559&amp;do=diff</link>
        <description>SSLCertExpiringSoon

Level: Major :!:

Purpose: The alert checks the SSL cert for a URL and reports if the cert will expire within 30 days.

Scenario: The SSL Certificate for the website will expire in 30 days.

Resolution:
Renew the SSL. All our websites that end in *.errigal.com use a GoDaddy wildcard cert. Check</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1503&amp;rev=1640084828&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-12-21T11:07:08+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1503</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1503&amp;rev=1640084828&amp;do=diff</link>
        <description>SSLCertExpiringImmediately

Level: Critical FIXME

Purpose:

Scenario: The SSL Certificate for &lt;server&gt; will expire in 7 days.

Resolution:
You shouldn&#039;t have to do anything for this 
Renew the SSL. All our websites that end in *.errigal.com use a GoDaddy wildcard cert. Check</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1504&amp;rev=1625494582&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-07-05T15:16:22+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1504</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1504&amp;rev=1625494582&amp;do=diff</link>
        <description>LetsEncryptSSLCertExpiringSoon

Level: Major

Purpose:
The alert checks the SSL cert for a URL and reports if the cert will expire within 90 days.

Scenario:
The SSL Certificate for the &lt;server&gt;  will expire in 30 days. Monitor it to verify it auto renews properly</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1505&amp;rev=1624612196&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-06-25T10:09:56+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1505</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1505&amp;rev=1624612196&amp;do=diff</link>
        <description>LetsEncryptSSLCertExpiringImmediately

Level: Critical FIXME

Purpose:

Scenario: The SSL Certificate for &lt;server&gt; will expire in 7 days. This should have been renewed by Lets Encrypt.

Resolution:

Manual Action Steps:

Auto Clear:</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1601&amp;rev=1625572026&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-07-06T12:47:06+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1601</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1601&amp;rev=1625572026&amp;do=diff</link>
        <description>UnusualNetworkThroughputIn

Level: Warning

Purpose: This alert tells us when network interfaces may be receiving too much data

Scenario: Unusual network throughput in &lt;server&gt; for 5m.

Resolution: You could run a TCP dump on the interface to view the traffic and if everything seems normal then our threshold may need updating. If things don&#039;t look normal then you would need to investigate the traffic and where it&#039;s coming from.</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1602&amp;rev=1640267692&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-12-23T13:54:52+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1602</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1602&amp;rev=1640267692&amp;do=diff</link>
        <description>UnusualNetworkThroughputOut

Level: Warning

Purpose:
To notify of any unusual behaviour which could signify potential issues such as perhaps a security breach?

Scenario: Unusual network throughput out &lt;server&gt; for 5m.

Resolution:
Check the network throughput per process:</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1603&amp;rev=1625483114&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-07-05T12:05:14+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1603</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1603&amp;rev=1625483114&amp;do=diff</link>
        <description>OutOfInodes

Level: Warning

Purpose:
Reports when a server&#039;s inodes are running low.

Scenario: Disk has had less than 10% of inodes left for 5m.

Resolution:
Check if there are any old log files on the system to remove.
sudo du -sxh /* 2&gt;/dev/null | sort -h</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1701&amp;rev=1625483911&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-07-05T12:18:31+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1701</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1701&amp;rev=1625483911&amp;do=diff</link>
        <description>MysqlDown

Level: Critical

Purpose: Ensure that MySQL is running on the server.

Scenario: MySQL instance has been down on server for 5m.

Resolution: Start Mysql if it is not running

Check Mysql status
sudo service mysqld status
Manual Action Steps:</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1702&amp;rev=1624612196&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-06-25T10:09:56+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1702</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1702&amp;rev=1624612196&amp;do=diff</link>
        <description>MysqlTooManyConnections

Level: Warning

Purpose:

Scenario: More than 80% of MySQL connections have been in use for 5m.

Resolution:

Manual Action Steps:

Auto Clear:</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1703&amp;rev=1625497332&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-07-05T16:02:12+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1703</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1703&amp;rev=1625497332&amp;do=diff</link>
        <description>MysqlHighThreadsRunning

Level: Warning

Purpose:
This alert is to ensure that the applications aren&#039;t doing too much with the Database at the same time. If a thread is slow or stuck in a running state then we must investigate.

Scenario: More than 60% of MySQL connections have been in a running state on &lt;server&gt; for 5m.</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1704&amp;rev=1625512082&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-07-05T20:08:02+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1704</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1704&amp;rev=1625512082&amp;do=diff</link>
        <description>MysqlSlaveIoThreadNotRunning

Level: Critical FIXME

Purpose: Alert Errigal engineers to a problem with replication to the secondary database(s). Replication databases are used for Reporting Applications, issues with replication mean the data becomes out of date and customer reports will be incorrect.</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1705&amp;rev=1625482265&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-07-05T11:51:05+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1705</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1705&amp;rev=1625482265&amp;do=diff</link>
        <description>MysqlSlaveSqlThreadNotRunning

Level: Critical FIXME

Purpose: This alert will tell us that replication is not running on a replica DB

Scenario: MySQL Slave SQL thread has not been running on &lt;server&gt; for 5m.

Resolution: Check the replica for errors with a query.</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1706&amp;rev=1639674096&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-12-16T17:01:36+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1706</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1706&amp;rev=1639674096&amp;do=diff</link>
        <description>MysqlSlaveReplicationLag

Level: Warning

Purpose:

Scenario: This can occur when there are additional processes running on the DB servers, e.g. key_stats or the database backups. Processes like these typically lock rows and cause replication to lag or fail.</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1707&amp;rev=1625483548&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-07-05T12:12:28+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1707</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1707&amp;rev=1625483548&amp;do=diff</link>
        <description>MysqlSlowQueries (#)

Level: Warning

Purpose:
Reports when queries are running slow in the database.

Scenario: MySQL server has been having some slow queries for 5m.

Resolution:
Log into MySQL as a root or writer account. Check the running queries by typing:</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1708&amp;rev=1625484458&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-07-05T12:27:38+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1708</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1708&amp;rev=1625484458&amp;do=diff</link>
        <description>MysqlRestarted

Level: Warning

Purpose: Identify if an unexpected Mysql restart occurred.

Scenario: MySQL restarted less than one minute ago on &lt;server&gt;.

Resolution: Verify that Mysql is running and that you can run a simple query.

Manual Action Steps:</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1801&amp;rev=1624612196&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-06-25T10:09:56+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1801</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1801&amp;rev=1624612196&amp;do=diff</link>
        <description>RabbitmqDown

Level: Critical FIXME

Purpose:

Scenario: Rabbitmq node has been down for 5m.

Resolution:

Manual Action Steps:

Auto Clear:</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1802&amp;rev=1624612196&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-06-25T10:09:56+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1802</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1802&amp;rev=1624612196&amp;do=diff</link>
        <description>RabbitMQOutOfMemory

Level: Warning :!:

Purpose:

Scenario: Less than 10% of memory has been available for RabbitMQ for 5m.

Resolution:

Manual Action Steps:

Auto Clear:</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1803&amp;rev=1624612196&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-06-25T10:09:56+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1803</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1803&amp;rev=1624612196&amp;do=diff</link>
        <description>RabbitmqInstancesDifferentVersions

Level: Critical FIXME

Purpose: Can lead to failure.

Scenario: Running different versions of Rabbitmq in the same cluster for 1h.

Resolution:

Manual Action Steps:

Auto Clear:</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1804&amp;rev=1625482764&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-07-05T11:59:24+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1804</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1804&amp;rev=1625482764&amp;do=diff</link>
        <description>RabbitmqMemoryHigh

Level: Critical FIXME

Purpose: To alert us when rabbitmq is using more memory than expected

Scenario: A node used more than 90% of allocated RAM for 2m.

Resolution: Investigate the RAM allocations and see if it is enough.

Manual Action Steps:</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1805&amp;rev=1640268218&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-12-23T14:03:38+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1805</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1805&amp;rev=1640268218&amp;do=diff</link>
        <description>RabbitmqFileDescriptorsUsage

Level: Critical FIXME

Purpose: Ensure the limits are not reached to ensure normal operation.

Scenario: A node used more than 90% of file descriptors for 2m.

Resolution:
The usage goes below 90% of the available file descriptors.</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1806&amp;rev=1697616008&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2023-10-18T09:00:08+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1806</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1806&amp;rev=1697616008&amp;do=diff</link>
        <description>RabbitmqTooMuchUnack

Level: Critical FIXME

Purpose:
Alerts if a queue in RabbitMQ has too many unacknowledged messages

Scenario: There have been &gt;1000 unacknowledged messages in a RabbitMQ queue for 1m.

Resolution:
Each queue should have a consumer. Check whether the consumer is still running or check the RabbitMQ logs for any connection errors.</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1807&amp;rev=1625495417&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-07-05T15:30:17+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1807</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1807&amp;rev=1625495417&amp;do=diff</link>
        <description>RabbitmqTooMuchConnections

Level: Critical FIXME

Purpose: Alert when a node has too many connections

Scenario: There have been &gt;1000 connections to a single node for 2m.

Resolution: Reduce RabbitMQ connections

Manual Action Steps: See &lt;https://www.rabbitmq.com/networking.html#dealing-with-high-connection-churn&gt;

Auto Clear:</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1808&amp;rev=1634303426&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-10-15T14:10:26+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1808</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1808&amp;rev=1634303426&amp;do=diff</link>
        <description>RabbitmqUnroutableMessages

Level: Critical FIXME

Purpose:
This alert indicates that an exchange does not have a consumer over the given period.

Scenario: A queue has had unroutable messages for 2m.

For example, when the exchange=“outer_orchestrator_trap_exchange</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1809&amp;rev=1651588126&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2022-05-03T15:28:46+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1809</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1809&amp;rev=1651588126&amp;do=diff</link>
        <description>RabbitmqTooManyMessagesInQueue

This could still use more additions

Level: Critical

Purpose:
This alert was made so that RabbitMQ doesn&#039;t fill up. It is usually indicative of something going wrong with the message processing from one of the applications which consumed the queue.</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1901&amp;rev=1625512687&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-07-05T20:18:07+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p1901</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p1901&amp;rev=1625512687&amp;do=diff</link>
        <description>EtcdServerHasNoLeader

Level: Critical FIXME

Purpose: Scheduled discoveries utilised in the MDC rely on ETCD to manage the process of leader election, i.e. which RDF Agent does the scheduled discovery. If ETCD is not running, neither will do the discovery, which is why this alert is important.</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p2000&amp;rev=1655812430&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2022-06-21T12:53:50+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p2000</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p2000&amp;rev=1655812430&amp;do=diff</link>
        <description>MaliciousFilesFound

Level: Critical FIXME

Purpose: After a recent DOS attack, some executable malicious files were found on servers, namely shiro.sh and loudscream. This alert will fire if any of these files are found on the server with executable permission.</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9109&amp;rev=1632230332&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-09-21T14:18:52+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p9109</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9109&amp;rev=1632230332&amp;do=diff</link>
        <description>MnetUnclearedCriticalMajorActiveAlarms

Level: Major

Purpose: Notify operations that the automated mNET groovlet that clears major &amp; critical alarms has failed to run. Uncleared active alarms will prevent new mNET tickets from being created.

Scenario: There are uncleared mNET critical or major active alarms. This can happen when an excessive amount of mNET traps are received over a short period of time. It can also happen when the in-memory remote ticket scheduler restarts automatically.</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9110&amp;rev=1627657609&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-07-30T16:06:49+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p9110</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9110&amp;rev=1627657609&amp;do=diff</link>
        <description>MinorMnetControllerTrapCount

Level: Warning :?:

Purpose: Notify operations that the number of mNET traps from a controller is higher than normal

Scenario: Greater than 1500 mNET traps have been received from a mNET Controller in the last hour.
The count query is run every 30 minutes and returns the top 5 mNET traps.</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9111&amp;rev=1627657659&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-07-30T16:07:39+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p9111</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9111&amp;rev=1627657659&amp;do=diff</link>
        <description>MajorMnetControllerTrapCount

Level: Major

Purpose: Notify operations that the number of mNET traps from a controller is high and may affect Snmp Manager Performance

Scenario: Greater than 3000 mNET traps have been received from a mNET Controller in the last hour.
The count query is run every 30 minutes and returns the top 5 mNET traps.</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9112&amp;rev=1627657675&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-07-30T16:07:55+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p9112</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9112&amp;rev=1627657675&amp;do=diff</link>
        <description>CriticalMnetControllerTrapCount

Level: Critical

Purpose: Notify operations that the number of mNET traps from a controller is very high and will affect Snmp Manager Performance

Scenario: Greater than 6000 mNET traps have been received from a mNET Controller in the last hour.
The count query is run every 30 minutes and returns the top 5 mNET traps.</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9113&amp;rev=1631021988&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2021-09-07T14:39:48+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p9113</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9113&amp;rev=1631021988&amp;do=diff</link>
        <description>MnetNoTrapsReceived

Level: Major

Purpose: Notify operations that no mNET traps have been received in the last couple of hours.

Scenario: No mNET traps have been received in the previous 4 hours.

Resolution:
Verify that the OpenVPN client is running on the ExteNet load balancer. If you can confirm that the OpenVPN client is running, inform ExteNet that no mNET traps have been received.</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9114&amp;rev=1693844014&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2023-09-04T17:13:34+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p9114</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9114&amp;rev=1693844014&amp;do=diff</link>
        <description>AtcDasTicketMissingSalesForceLink

Level: Major

Purpose: Notify operations that an ATC DAS ticket was not created in Salesforce

Scenario: More than one ATC DAS ticket was not created in Salesforce for the previous 7 days.

Resolution: After the 3.21 release a</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9115&amp;rev=1649331217&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2022-04-07T12:33:37+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p9115</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9115&amp;rev=1649331217&amp;do=diff</link>
        <description>ApacheServiceInactive

Level:   Critical     FIXME

Purpose: To check if the Apache httpd service is running. Uses the prometheus apache-exporter service

Scenario: The Apache httpd service is not running.

Resolution: Restart httpd service and check logs</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9116&amp;rev=1649331037&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2022-04-07T12:30:37+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p9116</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9116&amp;rev=1649331037&amp;do=diff</link>
        <description>ApacheSSLError

Level:   Critical     FIXME

Purpose: To check the Apache httpd error log for SSL errors using string “Unable to configure RSA server private key” in file /var/log/httpd/ssl_error_log

Scenario: The Apache httpd service is in error due to SSL key loading issue.</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9117&amp;rev=1652187299&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2022-05-10T13:54:59+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p9117</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9117&amp;rev=1652187299&amp;do=diff</link>
        <description>NodeMonitorPotentialLoadingIssues

Level:   Major

Purpose: Notify operations that a network element name has been added/renamed that can not be rendered in the Node Monitor

Scenario: A network element name has the potential to stop the Node Monitor from loading if it receives alarms</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9118&amp;rev=1671811342&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2022-12-23T16:02:22+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p9118</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9118&amp;rev=1671811342&amp;do=diff</link>
        <description>GrandChildTicket

Level:   Major

Purpose: Notify operations that a grand child ticket is on the system.

Scenario: Grand child tickets prevent a user from moving a ticket to a new state. The issue occurs occasionally when the automated system and a user parent a ticket at the exact same time.</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9119&amp;rev=1668611113&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2022-11-16T15:05:13+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p9119</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9119&amp;rev=1668611113&amp;do=diff</link>
        <description>DuplicateSyncSchedulesCreated

Level:   Major

Purpose: Temporary measure until &lt;https://errigal.atlassian.net/browse/OPSDEV-284&gt; is fixed. Bug in snmp manager which is creating duplicate alarm severity syncs in network_element_sync_setting table

Scenario: Alarm sync has no recent successful alarm severity sync, so the snmp manager creates a new one but doesn&#039;t check if there&#039;s one there already</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9120&amp;rev=1669028116&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2022-11-21T10:55:16+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p9120</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9120&amp;rev=1669028116&amp;do=diff</link>
        <description>ExteNetInocUserHasChangedUserprofilePermissions

Level:   Major

Purpose: Notify operations when an ExteNet INOC user changes their user account profile/visibility.

Scenario: An INOC user has changed their account to allow them access to additional areas of the IDMS suite. INOC users should have the following permissions:</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9121&amp;rev=1671116386&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2022-12-15T14:59:46+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p9121</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9121&amp;rev=1671116386&amp;do=diff</link>
        <description>ChildTicketCountTooLarge

Level:   Major

Purpose: Report when a ticket in the Extenet ticketer has more than 1500 child tickets, which can have negative effects on the system

Scenario: Users have grouped too many tickets

Resolution: Notify Extenet that there are tickets that have a growing number of child tickets and should be closed before they affect the system</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9124&amp;rev=1707292769&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2024-02-07T07:59:29+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p9124</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9124&amp;rev=1707292769&amp;do=diff</link>
        <description>MissingNetworkIdentifiers

Level:   Major

Purpose: A lot of ATC's processes, including SLA outages and Salesforce integration, rely on the network identifier. If an element is incorrectly loaded or loaded using the topology discovery, this field may be incorrect. This will need to be populated using the parent element's site - site number + side identifier e.g. 211597 Xcel Energy Center</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9125&amp;rev=1698756573&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2023-10-31T12:49:33+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p9125</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9125&amp;rev=1698756573&amp;do=diff</link>
        <description>SalesForceNoCarrierUpdateFor7Days

Level: Critical

Purpose: Notify ATC Team that SalesForce has not sent a Carrier Update for 7 days.

Scenario: Due to the new customer-focused methodology that was introduced to SalesForce in 2023, more customer interaction is required by SalesForce. This means that if we do not receive CUs (Carrier Update) or FMs (Force Majeure) from them, it might become a problem for SalesForce as they are not receiving anything from their customers.</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9126&amp;rev=1685114685&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2023-05-26T16:24:45+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p9126</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9126&amp;rev=1685114685&amp;do=diff</link>
        <description>SalesForceNoForceMajeureForManyDays

Level: Critical

Purpose: Notify ATC team that SalesForce has not sent a Force Majeure for many days.

Scenario: Due to the new customer-focused methodology that was introduced to SalesForce in 2023, more customer interaction is required by SalesForce. This means that if we do not receive CUs (Carrier Update) or FMs (Force Majeure) from them, it might become a problem for SalesForce as they are not receiving anything from their customers.</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9127&amp;rev=1682585954&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2023-04-27T09:59:14+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p9127</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9127&amp;rev=1682585954&amp;do=diff</link>
        <description>MissingBillingTiers

Level: Critical

Purpose: Notify Errigal Operations staff that a network element has been created which does not have a billing tier set. 

Scenario: If there is no billing tier set this will cause issues with Billing at the end of the month.</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9128&amp;rev=1699616416&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2023-11-10T11:40:16+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p9128</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9128&amp;rev=1699616416&amp;do=diff</link>
        <description>SalesforceAuthError

Level: Major

Purpose: Notify Errigal Operations staff that ticketer integration had an authentication error when trying to connect to salesforce

Scenario: If the Salesforce API failed due to maintenance or API error on the Salesforce side. Example error in ticketer integration logs:</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9129&amp;rev=1723138988&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2024-08-08T18:43:08+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p9129</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9129&amp;rev=1723138988&amp;do=diff</link>
        <description>ElastAlertNoScrape

Level: Critical

Purpose: Notify Errigal Operations staff that the elastalert application on prometheus prod has not completed a scrape on the DF-80 (80% disk full) or DF-60 (60% disk full) Elast Alert rules in the last 2 minutes.</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9130&amp;rev=1726735246&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2024-09-19T09:40:46+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p9130</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9130&amp;rev=1726735246&amp;do=diff</link>
        <description>MdcTasksFailing

Level: Major

Purpose: Notify operations that the percentage of MDC failed tasks has reached greater than 60% for more than 1 hour. Percentage is the amount of failed tasks in orchestrator database vs total processed tasks (failed + completed) for the last hour</description>
    </item>
    <item rdf:about="http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9131&amp;rev=1732805578&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2024-11-28T14:52:58+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>resolution_area:prometheus_resolutions:res-p9131</title>
        <link>http://3.86.49.49/doku.php?id=resolution_area:prometheus_resolutions:res-p9131&amp;rev=1732805578&amp;do=diff</link>
        <description>JMA Xran Syncs Failing</description>
    </item>
</rdf:RDF>
