Skip to content

Commit 9a34ca0

Browse files
taylor-leickForrestHansen
authored andcommitted
Create disconnected-alert.py
Disconnected alert (plus a python example)
1 parent 04b0764 commit 9a34ca0

File tree

1 file changed

+89
-0
lines changed

1 file changed

+89
-0
lines changed
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
# --- Purpose ---
2+
# Get a slack alert when a cluster has been disconnected for longer than your defined interval (default is 1 hour)
3+
# The intent is to run this script on a scheduled interval of your choice to work as a proactive disconnected alerting system.
4+
#
5+
# --- Required Steps ---
6+
# ALL USERS need to:
7+
# 1. populate the API_KEY variable with an API key they generated from fleet.scalecomputing.com.
8+
# 2. populate the SLACK_WEBHOOK_URL variable with the webhook URL generated by the Slack Workflow Builder, and configure your workflow to accept "text" as a variable. See https://slack.com/help/articles/360035692513-Guide-to-Slack-Workflow-Builder
9+
#
10+
# --- Optional Configuration ---
11+
# Customize the Message variables as needed for your preferences
12+
# Note: If you have >200 clusters, you will need to implement offsets and limits to iterate through all clusters
13+
# Increase or decrease the timedelta required to consider a cluster disconnected (default is 1 hour)
14+
# Modify the script if you do not want to be notified when all clusters are online.
15+
#
16+
# --- How this script works ---
17+
# This script gets all clusters, looks at how long in the past the lastCheckin value is, and if it exceeds the defined time delta, sends a Slack alert about the cluster(s) being offline. If no clusters are offline, a confirmation message is sent.
18+
19+
import requests
20+
import json
21+
from datetime import datetime, timezone, timedelta
22+
23+
# User-provided API and Webhook details
24+
API_KEY = 'INSERT API KEY'  # You must replace this with your actual API Key
SLACK_WEBHOOK_URL = 'INSERT SLACK WEBHOOK URL'  # You must replace this with your actual Slack webhook URL

# API endpoint details
# A single page is requested (offset 0, limit 200); with more than 200
# clusters the offset/limit values must be iterated to cover all of them.
API_URL = 'https://api.scalecomputing.com/api/v2/clusters?offset=0&limit=200'

# Headers sent with every clusters request; the API key travels in 'api-key'.
HEADERS = {
    'accept': 'application/json',
    'api-key': API_KEY,
}
33+
34+
def _parse_checkin(value):
    """Parse a ``lastCheckin`` ISO-8601 string into a timezone-aware datetime.

    Returns None when *value* is not a string or cannot be parsed, so that one
    malformed record does not abort the whole check.
    """
    try:
        parsed = datetime.fromisoformat(value.replace('Z', '+00:00'))
    except (AttributeError, ValueError):
        return None
    if parsed.tzinfo is None:
        # A naive timestamp cannot be subtracted from the aware "now" below
        # (TypeError). NOTE(review): naive values are assumed to be UTC.
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed


def get_stale_clusters(max_age=timedelta(hours=1), timeout=30):
    """Fetch clusters from the API and identify those with stale check-ins.

    Args:
        max_age: How far in the past a cluster's ``lastCheckin`` may be before
            the cluster counts as disconnected. Optional: change the
            "disconnected time" from the default one hour here.
        timeout: Seconds to wait for the API before giving up, so a scheduled
            run cannot hang forever on an unresponsive server.

    Returns:
        A list with one dict per stale cluster (keys ``id``, ``name``,
        ``lastCheckin``, ``onlineStatus``, ``healthState``); the list is empty
        when nothing is stale. Returns None when the API request failed or the
        response body was not valid JSON.
    """
    try:
        response = requests.get(API_URL, headers=HEADERS, timeout=timeout)
        response.raise_for_status()  # Raises an HTTPError for bad responses (4xx or 5xx)
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose
        # JSON decode error is not a RequestException subclass.
        print(f"Error making API request: {e}")
        return None

    now = datetime.now(timezone.utc)
    stale_clusters = []

    # "or []" also tolerates an explicit null for 'items'.
    for item in data.get('items') or []:
        last_checkin_str = item.get('lastCheckin')
        if not last_checkin_str:
            # Clusters without a lastCheckin value are not evaluated.
            continue

        last_checkin_time = _parse_checkin(last_checkin_str)
        if last_checkin_time is None:
            print(
                f"Skipping cluster {item.get('name')!r}: "
                f"unparseable lastCheckin {last_checkin_str!r}"
            )
            continue

        if now - last_checkin_time > max_age:
            stale_clusters.append({
                'id': item.get('id'),
                'name': item.get('name'),
                'lastCheckin': last_checkin_str,
                'onlineStatus': item.get('onlineStatus'),
                'healthState': item.get('healthState'),
            })

    return stale_clusters
63+
64+
def send_slack_message(clusters, timeout=30):
    """Format and send a message to Slack via the configured webhook.

    Args:
        clusters: Iterable of dicts with ``name``, ``lastCheckin``,
            ``healthState`` and ``id`` keys, one per stale cluster. An empty
            (falsy) value sends the "all clusters have checked in" message.
        timeout: Seconds to wait for the webhook before giving up, so the
            script cannot hang indefinitely on an unresponsive endpoint.

    Returns:
        None. Success or failure of the POST is reported on stdout; request
        errors are caught here and not re-raised.
    """
    if not clusters:
        message = "All clusters have checked in within the last hour. No issues found. ✅"
    else:
        cluster_list = '\n'.join(
            f"- Cluster Name: {c['name']}, Last Checkin: {c['lastCheckin']}, Status: {c['healthState']}, FM URL: https://fleet.scalecomputing.com/clusters/{c['id']}"
            for c in clusters
        )
        message = f"🚨 The following clusters have not checked in for over an hour:\n\n{cluster_list}"

    # The payload carries the message under the "text" key.
    slack_payload = {
        "text": message
    }

    try:
        response = requests.post(
            SLACK_WEBHOOK_URL,
            data=json.dumps(slack_payload),
            headers={'Content-Type': 'application/json'},
            timeout=timeout,
        )
        response.raise_for_status()
        print("Slack message sent successfully.")
    except requests.exceptions.RequestException as e:
        print(f"Error sending message to Slack: {e}")
85+
86+
if __name__ == '__main__':
    stale_clusters = get_stale_clusters()
    if stale_clusters is None:
        # The cluster fetch failed (the error was already printed). Exit with
        # a non-zero status so the scheduler running this script can tell the
        # check did not actually happen, instead of treating it as a success.
        raise SystemExit(1)
    send_slack_message(stale_clusters)

0 commit comments

Comments
 (0)