add alerts

- worker: change Adapter._construct to receive an extra arg
- worker: hardcode a bad result and write it off on workloop fail
Luna Mendes 2018-08-30 22:22:02 -03:00
parent a04d44e7de
commit 7a2f9a95c7
6 changed files with 140 additions and 14 deletions


@@ -9,7 +9,8 @@ SERVICES = {
         'adapter_args': {
             'url': 'https://elixi.re/api/hello'
         },
-        'poll': 10
+        'poll': 10,
+        'alerts': ['beep'],
     },
     'dabian': {
         'description': 'elixi.re main server',
@@ -17,6 +18,14 @@ SERVICES = {
         'adapter_args': {
             'address': '192.168.1.1'
         },
-        'poll': 15
+        'poll': 15,
+        'alerts': ['beep'],
     }
 }
+
+ALERTS = {
+    'bepsi': {
+        'type': 'discord',
+        'url': 'beep boop'
+    }
+}
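
For context on the new keys: a service opts into alerting by listing alert names under 'alerts', and each name is resolved against the top-level ALERTS mapping when a status change fires. A minimal sketch of that lookup, using the sample values above (note the samples list 'beep', which has no matching ALERTS entry, so the worker would just log 'alert not found' and skip it):

ALERTS = {
    'bepsi': {'type': 'discord', 'url': 'beep boop'},
}

service = {'name': 'dabian', 'alerts': ['beep']}

for name in service['alerts']:
    if name not in ALERTS:
        # mirrors what ServiceWorker logs in this case
        print(f'alert not found: {name!r}')
        continue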


@@ -19,7 +19,14 @@ class Adapter:
     @classmethod
     def _construct(cls, *args) -> dict:
         columns = cls.spec['db'][1:]
-        return {col: args[idx] for idx, col in enumerate(columns)}
+        base = {col: args[idx] for idx, col in enumerate(columns)}
+
+        # if there's an extra arg, treat it as the error field
+        if len(args) > len(columns):
+            base['error'] = args[len(args) - 1]
+
+        return base
 
 
 class PingAdapter(Adapter):
@@ -77,12 +84,16 @@ class HttpAdapter(Adapter):
         resp = await session.get(f'{adp_args["url"]}')
         t_end = time.monotonic()
 
+        succ = resp.status == 200
         latency = round((t_end - t_start) * 1000)
+
+        # drop latency to 0 to signal a non-success
+        latency = latency if succ else 0
+
         worker.log.info(f'status={resp.status} latency={latency}ms')
 
-        if resp.status == 200:
-            return cls._construct(True, latency)
+        if not succ:
+            err_str = f'HTTP Status - {resp.status}'
+            return cls._construct(succ, latency, err_str)
 
-        # use 0ms drops as failures
-        return cls._construct(False, 0)
+        return cls._construct(succ, latency if succ else 0)
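
As a quick check of the reworked _construct, here is what it returns with and without the extra error argument. DemoAdapter is hypothetical; the ('timestamp', 'status', 'latency') column spec assumes the shape HttpAdapter uses, where _construct skips the timestamp column via spec['db'][1:]:

class DemoAdapter:
    spec = {'db': ('timestamp', 'status', 'latency')}

    @classmethod
    def _construct(cls, *args) -> dict:
        columns = cls.spec['db'][1:]
        base = {col: args[idx] for idx, col in enumerate(columns)}
        if len(args) > len(columns):
            base['error'] = args[len(args) - 1]
        return base

print(DemoAdapter._construct(True, 120))
# {'status': True, 'latency': 120}

print(DemoAdapter._construct(False, 0, 'HTTP Status - 502'))
# {'status': False, 'latency': 0, 'error': 'HTTP Status - 502'}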

elstat/alerts.py (new file, 40 lines)

@@ -0,0 +1,40 @@
+import logging
+
+from aiohttp import ClientSession
+
+log = logging.getLogger(__name__)
+
+
+class DiscordAlert:
+    def __init__(self, alert_name: str, alert: dict):
+        self.name = alert_name
+        self.url = alert['url']
+        self.session = ClientSession()
+
+    def _make_payload(self, service, status):
+        serv_name = service['name']
+        is_up = status['status']
+        err = status.get('error', 'No error provided')
+
+        color = 0x00ff00 if is_up else 0xff0000
+
+        embed = {
+            'title': serv_name,
+            'color': color
+        }
+
+        if not is_up:
+            embed['description'] = err
+
+        return {
+            'content': '',
+            'embeds': [embed],
+        }
+
+    async def post(self, service: dict, status: dict):
+        payload = self._make_payload(service, status)
+
+        log.warning(f'Posting an alert! {status.get("error")}')
+
+        async with self.session.post(self.url, json=payload) as resp:
+            return resp
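
For reference, a sketch of the payload _make_payload builds for a failing service (values are illustrative; the webhook URL comes from the alert config):

service = {'name': 'dabian'}
status = {'status': False, 'error': 'HTTP Status - 502'}

payload = {
    'content': '',
    'embeds': [{
        'title': service['name'],        # embed is titled after the service
        'color': 0xff0000,               # red when down, 0x00ff00 when up
        'description': status['error'],  # only set when the service is down
    }],
}

Design note: the ClientSession is created once in __init__ and reused for every webhook post, rather than opening a new session per alert.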


@@ -1,4 +1,5 @@
 from .adapters import HttpAdapter, PingAdapter
+from .alerts import DiscordAlert
 
 ADAPTERS = {
@@ -6,6 +7,10 @@ ADAPTERS = {
     'ping': PingAdapter,
 }
 
+ALERTS = {
+    'discord': DiscordAlert
+}
+
 
 class IncidentType:
     OUTAGE = 'outage'


@@ -1,9 +1,9 @@
 import logging
 import json
 
-from typing import List, Dict, Any
+from typing import List, Any
 
-from .consts import ADAPTERS
+from .consts import ADAPTERS, ALERTS
 from .worker import ServiceWorker
 from .blueprints.streaming import OP
@@ -25,6 +25,7 @@ class ServiceManager:
         self.loop = app.loop
         self.workers = {}
+        self.alerts = {}
         self.state = {}
         self.subscribers = {}
         self._websockets = {}
@@ -74,14 +75,17 @@ class ServiceManager:
     def _create_channels(self, worker):
         columns = worker.adapter.spec['db']
 
         # each service has a status and latency channel
         self._check(columns, 'status', worker.name)
         self._check(columns, 'latency', worker.name)
 
     def _start(self):
         self.subscribers['incidents'] = []
 
+        # init services
         for name, service in self.cfg.SERVICES.items():
             self._make_db_table(name, service)
+            service['name'] = name
+
             # spawn a service worker
             serv_worker = ServiceWorker(self, name, service)
@@ -90,6 +94,11 @@ class ServiceManager:
             self._create_channels(serv_worker)
 
+        # init alerts
+        for name, alert in self.cfg.ALERTS.items():
+            alert_cls = ALERTS[alert['type']]
+            self.alerts[name] = alert_cls(name, alert)
+
     def close(self):
         for worker in self.workers.values():
             worker.stop()
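
Roughly, after _start runs against the sample config, the manager holds one alert object per configured alert. A sketch of that mapping (the import path is assumed from the consts change above):

from elstat.alerts import DiscordAlert

cfg_alerts = {'bepsi': {'type': 'discord', 'url': 'beep boop'}}

alerts = {}
for name, alert in cfg_alerts.items():
    alert_cls = {'discord': DiscordAlert}[alert['type']]  # via consts.ALERTS
    alerts[name] = alert_cls(name, alert)
# alerts == {'bepsi': <DiscordAlert named 'bepsi'>}

Note that _start also stamps each service dict with its own name (service['name'] = name), which is what DiscordAlert later uses as the embed title.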


@@ -22,12 +22,20 @@ class ServiceWorker:
     async def process_work(self, result: dict):
         """Process given adapter result and insert into
         the database."""
+        try:
+            # we work with a copy of result for main db
+            # operations (without error field)
+            db_res = dict(result)
+            db_res.pop('error')
+        except KeyError:
+            pass
+
         columns = self.adapter.spec['db']
         conn = self.manager.conn
 
         timestamp = int(time.time() * 1000)
-        args_str = ','.join(['?'] * (len(result) + 1))
+        args_str = ','.join(['?'] * (len(db_res) + 1))
 
         query = f"""
         INSERT INTO {self.name} ({','.join(columns)})
         VALUES ({args_str})
@@ -35,21 +43,57 @@ class ServiceWorker:
         args = []
 
         for col in columns[1:]:
-            val = result[col]
+            val = db_res[col]
             args.append(val)
 
         conn.execute(query, (timestamp, ) + tuple(args))
         conn.commit()
 
         await self._dispatch_work(columns, timestamp, result)
+        await self._check_alert(result)
 
     async def _dispatch_work(self, columns, timestamp: int, result: tuple):
         """Dispatch the work done by the adapter
         through the channels"""
         prechan = columns[1:]
         chans = [f'{chan}:{self.name}' for chan in prechan]
 
         for col, chan in zip(prechan, chans):
             self.manager.publish(chan, [timestamp, result[col]])
 
+    async def _check_alert(self, work):
+        """Check if any alerts should be fired off by status changes."""
+        cur = self.manager.conn.cursor()
+        cur.execute(f"""
+        SELECT status FROM {self.name}
+        ORDER BY timestamp DESC
+        LIMIT 2
+        """)
+
+        rows = cur.fetchall()
+
+        # extract latest and old from rows
+        first, last = rows
+        first_status, last_status = first[0], last[0]
+
+        # don't do anything if there's no change
+        # to the statuses
+        if first_status == last_status:
+            return
+
+        # oopsie whoopsie time to alertie owo
+        alerts = self.service.get('alerts', [])
+
+        for alert in alerts:
+            try:
+                alert_obj = self.manager.alerts[alert]
+            except KeyError:
+                self.log.error(f'alert not found: {alert!r}')
+                continue
+
+            await alert_obj.post(self.service, work)
 
     async def _work_loop(self):
         try:
             while True:
@@ -62,11 +106,19 @@ class ServiceWorker:
                 await asyncio.sleep(self.service['poll'])
         except asyncio.CancelledError:
             self.log.info('cancelled, stopping')
-        except Exception:
+        except Exception as err:
             self.log.exception('fail on work loop, retrying')
             try:
-                self.manager.state[self.name]['status'] = False
-                self.manager.publish(f'status:{self.name}', False)
+                # hardcode a bad result on workloop failures
+                result = {
+                    'status': False,
+                    'latency': 0,
+                    'error': str(err),
+                }
+
+                # FORCE EVERYONE TO KNOW ABOUT THAT FAILURE
+                self.manager.state[self.name] = result
+                await self.process_work(result)
             except KeyError:
                 pass
             await self._work_loop()
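
Putting the worker changes together: when the loop dies, the synthesized bad result flows through the normal process_work path, so it reaches the db, the channels, and the alert check like any other poll. A sketch of that flow (column names assume the ('timestamp', 'status', 'latency') spec):

# synthesized failure result; 'error' carries str(err) from the caught exception
result = {'status': False, 'latency': 0, 'error': "TimeoutError('poll died')"}

# process_work strips the error field before the INSERT...
db_res = dict(result)
db_res.pop('error')
assert db_res == {'status': False, 'latency': 0}

# ...but hands the full result to _check_alert, which only fires when the
# two most recent status rows disagree
rows = [(0,), (1,)]  # latest poll down, previous poll up
first, last = rows
assert first[0] != last[0]  # status changed -> alerts post

One caveat worth flagging: on a service's very first poll the SELECT in _check_alert returns a single row, so the `first, last = rows` unpack raises ValueError; the work loop's broad `except Exception` appears to treat that like any other failure.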