# fester/backend/analysis/cause_graph.py
#
# 139 lines
# 3.6 KiB
# Python

from collections import defaultdict
import time
class CauseGraphEngine:
    """
    Builds a causal graph from system events.

    This is NOT execution logic.
    It is a post-hoc + live reasoning layer.

    Events are plain dicts. ``ingest`` records every event and, based on
    its ``"type"`` key, appends a context dict to ``graph`` under the key
    ``(node, action)``. Node state updates are stored under
    ``(node, None)``.
    """

    def __init__(self):
        # (node, action) -> list of context dicts, in ingestion order
        self.graph = defaultdict(list)
        # event timeline: every ingested event, including unrecognised types
        self.events = []
        # (node, action) -> most recent task_update context
        self.last_context = {}
        # debug bus; stays None until attach_bus() is called, so that
        # emit_debug() is a safe no-op on a freshly built engine
        self.bus = None

    # -------------------------------------------------
    # INGEST EVENTS
    # -------------------------------------------------
    def ingest(self, event: dict):
        """
        Record ``event`` in the timeline and route it by its ``"type"``.

        Recognised types are ``"task_update"``, ``"cache_update"``,
        ``"failure"`` and ``"node_update"``. Any other type (or a missing
        one) is kept in ``self.events`` but adds nothing to the graph.
        """
        self.events.append(event)
        etype = event.get("type")
        # store context for causal linking
        if etype == "task_update":
            self._handle_task(event)
        elif etype == "cache_update":
            self._handle_cache(event)
        elif etype == "failure":
            self._handle_failure(event)
        elif etype == "node_update":
            self._handle_node(event)

    def attach_bus(self, bus):
        """Set the object whose ``emit`` method ``emit_debug`` will call."""
        self.bus = bus

    def emit_debug(self, data):
        """
        Emit ``data`` as the ``meta`` of a ``"debug"`` message on the bus.

        Does nothing when no bus has been attached.
        """
        # Compare against None rather than relying on truthiness, so a bus
        # object that happens to be falsy still receives the message.
        if self.bus is None:
            return
        self.bus.emit(
            "debug",
            node=None,
            action=None,
            state="trace",
            meta=data,
        )

    # -------------------------------------------------
    # TASK EXECUTION CAUSALITY
    # -------------------------------------------------
    def _handle_task(self, event):
        """
        Append a ``"scheduler_assignment"`` context for a task update.

        The context is also remembered in ``last_context`` so that a later
        failure on the same ``(node, action)`` can reference it.
        """
        node = event.get("node")
        action = event.get("action")
        state = event.get("state")
        context = {
            # time of ingestion, not a timestamp taken from the event
            "time": time.time(),
            "node": node,
            "action": action,
            "state": state,
            "reason": "scheduler_assignment",
        }
        self.last_context[(node, action)] = context
        self.graph[(node, action)].append(context)

    # -------------------------------------------------
    # CACHE CAUSALITY
    # -------------------------------------------------
    def _handle_cache(self, event):
        """Append a ``"cache_hit_or_miss"`` context for a cache update."""
        node = event.get("node")
        action = event.get("action")
        context = {
            "time": time.time(),
            "node": node,
            "action": action,
            "reason": "cache_hit_or_miss",
        }
        self.graph[(node, action)].append(context)

    # -------------------------------------------------
    # FAILURE CAUSALITY
    # -------------------------------------------------
    def _handle_failure(self, event):
        """
        Append an ``"execution_failure"`` context for a failure event.

        ``"parent_context"`` is the last task context recorded for the same
        ``(node, action)`` (the same dict object, not a copy), or an empty
        dict when no task update was seen for that key.
        """
        node = event.get("node")
        action = event.get("action")
        prev = self.last_context.get((node, action), {})
        context = {
            "time": time.time(),
            "node": node,
            "action": action,
            "reason": "execution_failure",
            "parent_context": prev,
        }
        self.graph[(node, action)].append(context)

    # -------------------------------------------------
    # NODE STATE CAUSALITY
    # -------------------------------------------------
    def _handle_node(self, event):
        """
        Append a ``"node_state_update"`` context under ``(node, None)``.

        The event's ``"node_state"`` value (default ``{}``) is stored under
        the context's ``"state"`` key.
        """
        node = event.get("node")
        context = {
            "time": time.time(),
            "node": node,
            "reason": "node_state_update",
            "state": event.get("node_state", {}),
        }
        self.graph[(node, None)].append(context)

    # -------------------------------------------------
    # QUERY INTERFACE
    # -------------------------------------------------
    def explain(self, node, action=None):
        """
        Returns causal chain for a given execution unit.

        The result is the list stored under ``(node, action)``, or an empty
        list when nothing was recorded for that key. Uses ``dict.get`` so
        that querying an unknown key does not create an entry in the
        underlying defaultdict. With ``action=None`` this returns what was
        stored under ``(node, None)``, which is where node state updates go.
        """
        return self.graph.get((node, action), [])

    def full_trace(self):
        """Return the graph as a plain ``dict`` (shallow copy; lists are shared)."""
        return dict(self.graph)