|
| 1 | +# --- Purpose --- |
| 2 | +# Get a slack alert when a cluster has been disconnected for longer than your defined interval (default is 1 hour) |
| 3 | +# The intent is to run this script on a scheduled interval of your choice to work as a proactive disconnected alerting system. |
| 4 | +# |
| 5 | +# --- Required Steps --- |
| 6 | +# ALL USERS need to: |
| 7 | +# 1. populate the API_KEY variable with an API key they generated from fleet.scalecomputing.com. |
# 2. input your webhook URL generated by the workflow builder, and configure your workflow to accept "text" as a variable. https://slack.com/help/articles/360035692513-Guide-to-Slack-Workflow-Builder
| 9 | +# |
| 10 | +# --- Optional Configuration --- |
| 11 | +# Customize the Message variables as needed for your preferences |
| 12 | +# Note: If you have >200 clusters, you will need to implement offsets and limits to iterate through all clusters |
| 13 | +# Increase or decrease the timedelta required to consider a cluster disconnected (default is 1 hour) |
# Modify the script if you do not want to be notified when all clusters are online.
| 15 | +# |
| 16 | +# --- How this script works --- |
# This script gets all clusters, looks at how long in the past the lastCheckin value is, and if it exceeds the defined time delta, sends a Slack alert about the cluster(s) being offline. If no clusters are offline, a confirmation message is sent.
| 18 | + |
| 19 | +import requests |
| 20 | +import json |
| 21 | +from datetime import datetime, timezone, timedelta |
| 22 | + |
# User-provided API and Webhook details
API_KEY = 'INSERT API KEY'  # Replace with an API key generated at fleet.scalecomputing.com
SLACK_WEBHOOK_URL = 'INSERT SLACK WEBHOOK URL'  # Replace with your Slack workflow webhook URL

# Fleet Manager clusters endpoint; fetches the first page of up to 200 clusters.
API_URL = 'https://api.scalecomputing.com/api/v2/clusters?offset=0&limit=200'

# Headers sent with every Fleet Manager request; 'api-key' authenticates the caller.
HEADERS = {'accept': 'application/json', 'api-key': API_KEY}
| 33 | + |
def get_stale_clusters(stale_after=timedelta(hours=1)):
    """Fetch all clusters and return those whose last check-in is older than *stale_after*.

    Args:
        stale_after: timedelta beyond which a cluster's lastCheckin counts as
            stale. Defaults to 1 hour, matching the script's documented behavior.

    Returns:
        A list of dicts (id, name, lastCheckin, onlineStatus, healthState) for
        each stale cluster, an empty list when every cluster is current, or
        None when the API request fails.
    """
    try:
        # A timeout keeps a scheduled run from hanging forever on an
        # unresponsive API; timeouts surface as RequestException below.
        response = requests.get(API_URL, headers=HEADERS, timeout=30)
        response.raise_for_status()  # Raises HTTPError for 4xx/5xx responses
        items = response.json().get('items', [])
        return _find_stale(items, datetime.now(timezone.utc), stale_after)

    except requests.exceptions.RequestException as e:
        print(f"Error making API request: {e}")
        return None


def _find_stale(items, now, stale_after):
    """Return the subset of cluster records whose lastCheckin predates now - stale_after."""
    stale_clusters = []
    for item in items:
        last_checkin_str = item.get('lastCheckin')
        if not last_checkin_str:
            # No timestamp to compare — presumably the cluster never checked
            # in; skipped here, as in the original logic. TODO confirm intent.
            continue
        # The API returns ISO-8601 with a trailing 'Z'; fromisoformat (before
        # Python 3.11) requires an explicit UTC offset instead.
        last_checkin_time = datetime.fromisoformat(last_checkin_str.replace('Z', '+00:00'))
        if now - last_checkin_time > stale_after:
            stale_clusters.append({
                'id': item.get('id'),
                'name': item.get('name'),
                'lastCheckin': last_checkin_str,
                'onlineStatus': item.get('onlineStatus'),
                'healthState': item.get('healthState'),
            })
    return stale_clusters
| 63 | + |
def send_slack_message(clusters):
    """Send a Slack notification summarizing stale clusters (or an all-clear).

    Args:
        clusters: list of stale-cluster dicts as produced by
            get_stale_clusters(); an empty list triggers the confirmation
            message instead of an alert.
    """
    if not clusters:
        message = "All clusters have checked in within the last hour. No issues found. ✅"
    else:
        cluster_list = '\n'.join(
            f"- Cluster Name: {c['name']}, Last Checkin: {c['lastCheckin']}, Status: {c['healthState']}, FM URL: https://fleet.scalecomputing.com/clusters/{c['id']}"
            for c in clusters
        )
        message = f"🚨 The following clusters have not checked in for over an hour:\n\n{cluster_list}"

    try:
        # requests' json= parameter serializes the payload and sets the
        # Content-Type header itself; the timeout keeps a scheduled run from
        # hanging on an unresponsive webhook endpoint.
        response = requests.post(SLACK_WEBHOOK_URL, json={"text": message}, timeout=30)
        response.raise_for_status()
        print("Slack message sent successfully.")
    except requests.exceptions.RequestException as e:
        print(f"Error sending message to Slack: {e}")
| 85 | + |
if __name__ == '__main__':
    # None signals an API failure (already logged); only a real list — even an
    # empty one, which produces the all-clear message — is forwarded to Slack.
    offline_clusters = get_stale_clusters()
    if offline_clusters is not None:
        send_slack_message(offline_clusters)