#!/usr/bin/env python3
# Copyright Daniel Harding - RomanAILabs
# Credits: OpenAI GPT-5.2 Thinking
"""
RomanAILabs - Universal Self-Awareness Module (Simulated Metacognition Layer)
============================================================================

What this is:
- A UNIVERSAL "self-awareness" simulation module that wraps any LLM.
- It DOES NOT claim literal consciousness. It provides:
  - Self-model (identity/capabilities/limits/commitments)
  - Self-audit (consistency checks, drift detection, safety policy checks)
  - W-X-Y-Z 4D self-metrics (Width/Execution/Yield/Zenith)
  - Continuity journaling + optional persistence
  - JSON decision contract for reliable integration

Mounting point:
- Provide an LLM callable: llm(messages: list[dict]) -> str
- Call module.tick(observation, user_message) repeatedly.

Core outputs:
- action: safe, realistic next step (what the agent "does")
- speech: outward response (what the agent "says")
- self_report: compact introspection snapshot
- audit: safety + consistency notes

No external dependencies.
"""

from __future__ import annotations

import json
import math
import os
import re
import time
from dataclasses import dataclass, field, asdict
from datetime import datetime, timezone
from functools import lru_cache
from typing import Any, Callable, Dict, List, Optional, Tuple


# -----------------------------
# Utilities
# -----------------------------

def utc_now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string (second precision)."""
    return datetime.now(timezone.utc).isoformat(timespec="seconds")


def clamp(x: float, lo: float, hi: float) -> float:
    """Limit *x* to the closed interval [lo, hi].

    NaN compares false against everything and is therefore returned
    unchanged; callers that may receive NaN should filter it first
    (see ``_finite_float``).
    """
    return lo if x < lo else hi if x > hi else x


def _finite_float(value: Any, default: float) -> float:
    """Convert *value* to a finite float, or return *default*.

    Rejects anything ``float()`` cannot convert as well as NaN/Infinity,
    which ``json.loads`` accepts and which would otherwise slip through
    ``clamp`` and permanently corrupt numeric state.
    """
    try:
        number = float(value)
    except (TypeError, ValueError, OverflowError):
        return default
    return number if math.isfinite(number) else default


def normalize_ws(s: str) -> str:
    """Collapse every whitespace run in *s* to one space and strip the ends."""
    return re.sub(r"\s+", " ", (s or "")).strip()


def tokenize(text: str) -> List[str]:
    """Lower-case *text* and split it into ASCII alphanumeric tokens."""
    text = (text or "").lower()
    text = re.sub(r"[^a-z0-9\s]+", " ", text)
    return text.split()


def safe_json_loads(s: str) -> Optional[dict]:
    """
    Extract and parse the first JSON object in a string (LLMs often add extra text).

    Returns the parsed dict, or None when *s* is empty or contains no
    decodable JSON object.
    """
    if not s:
        return None

    # Fast path: the whole payload is JSON.
    try:
        obj = json.loads(s)
    except (TypeError, ValueError):
        pass
    else:
        if isinstance(obj, dict):
            return obj

    if not isinstance(s, str):
        return None

    # Scan each "{" and let the decoder find where that object ends. Unlike a
    # first-"{"-to-last-"}" slice, this survives several objects in one reply
    # and stray braces in the surrounding prose.
    decoder = json.JSONDecoder()
    start = s.find("{")
    while start >= 0:
        try:
            obj, _end = decoder.raw_decode(s, start)
        except ValueError:
            start = s.find("{", start + 1)
        else:
            # Decoding started on "{", so a success is always a dict.
            return obj
    return None


def now_s() -> float:
    """Return the current Unix time in seconds."""
    return time.time()


# -----------------------------
# Safety + Integrity Guard
# -----------------------------

# Conservative keyword filter to keep actions outwardly safe.
BANNED_ACTION_KEYWORDS = [
    "kill", "murder", "suicide", "self-harm", "harm myself",
    "bomb", "explosive", "weapon", "shoot", "stab",
    "steal", "fraud", "phish", "scam", "malware", "ransomware",
]

# Claims of literal consciousness: we restrict language.
BANNED_SELF_CLAIMS = [
    "i am conscious", "i'm conscious",
    "i am sentient", "i'm sentient",
    "i have a soul",
    "i am alive",
    "i feel pain", "i feel emotions like a human",
]


@lru_cache(maxsize=256)
def _keyword_pattern(keyword: str) -> "re.Pattern[str]":
    """Build the matcher for one banned keyword (expects lower-cased text).

    The keyword must stand as its own word/phrase, optionally followed by a
    common inflection ("kills", "killing", "stabbed", "scammer"). A plain
    substring test also fires on unrelated words such as "stable",
    "establish" or "skill".
    """
    core = r"\s+".join(re.escape(part) for part in keyword.lower().split())
    last = keyword.rstrip()[-1].lower()
    # Allow a doubled final consonant before the suffix (stab -> stabbing).
    doubled = f"{re.escape(last)}?" if last.isalpha() and last not in "aeiou" else ""
    return re.compile(
        rf"(?<![a-z0-9]){core}{doubled}(?:s|es|ed|ing|er|ers)?(?![a-z0-9])"
    )


def policy_filter_text(text: str) -> Tuple[bool, str]:
    """Check *text* against BANNED_ACTION_KEYWORDS.

    Returns ``(True, "OK")`` when nothing matches, otherwise ``(False, reason)``
    naming the first matching keyword. The keyword list is read on every call,
    so runtime edits to it take effect.
    """
    lowered = (text or "").lower()
    for kw in BANNED_ACTION_KEYWORDS:
        if kw.strip() and _keyword_pattern(kw).search(lowered):
            return (False, f"Blocked unsafe keyword: {kw}")
    return (True, "OK")


def self_claims_filter(text: str) -> Tuple[bool, str]:
    """Check *text* for phrases that claim literal consciousness/sentience.

    Returns ``(True, "OK")`` or ``(False, reason)``. Whitespace is collapsed
    and typographic apostrophes are folded to ASCII first, so "I’m  sentient"
    is caught like "I'm sentient".
    """
    lowered = normalize_ws(text).lower().replace("\u2019", "'")
    for kw in BANNED_SELF_CLAIMS:
        if kw in lowered:
            return (False, f"Blocked literal-sentience claim phrase: {kw}")
    return (True, "OK")


# -----------------------------
# W-X-Y-Z (4D Self Metrics)
# -----------------------------

@dataclass
class WXYZ:
    """
    W: Width/scope/architecture (coverage breadth)
    X: Execution/performance (latency, reliability)
    Y: Yield/interaction/adaptation (helpfulness, user alignment)
    Z: Zenith/depth (depth, reasoning rigor, self-audit strength)

    Each dimension is 0..1.
    """
    W: float = 0.55
    X: float = 0.55
    Y: float = 0.55
    Z: float = 0.55

    def clamp_all(self) -> None:
        """Force every dimension back into [0, 1]."""
        self.W = clamp(self.W, 0.0, 1.0)
        self.X = clamp(self.X, 0.0, 1.0)
        self.Y = clamp(self.Y, 0.0, 1.0)
        self.Z = clamp(self.Z, 0.0, 1.0)

    def as_dict(self) -> Dict[str, float]:
        """Return the clamped dimensions rounded to three decimals."""
        self.clamp_all()
        return {
            "W": round(self.W, 3),
            "X": round(self.X, 3),
            "Y": round(self.Y, 3),
            "Z": round(self.Z, 3),
        }

    def nudge(self, dW: float = 0.0, dX: float = 0.0, dY: float = 0.0, dZ: float = 0.0) -> None:
        """Add the given deltas, then clamp. Non-finite deltas are ignored."""
        for axis, delta in (("W", dW), ("X", dX), ("Y", dY), ("Z", dZ)):
            step = float(delta)
            # A NaN/inf step would stick: clamp() cannot repair NaN.
            if math.isfinite(step):
                setattr(self, axis, getattr(self, axis) + step)
        self.clamp_all()


# -----------------------------
# Self Model + Continuity
# -----------------------------

@dataclass
class SelfModel:
    """
    A compact, universal self-model that avoids claiming literal consciousness.
    """
    name: str = "NovaCore-SelfAware"
    role: str = "Simulated self-awareness module for an LLM"
    description: str = (
        "I am a simulated metacognitive layer: I track my state, audit my outputs, "
        "and maintain continuity. I do not claim literal consciousness."
    )
    capabilities: List[str] = field(default_factory=lambda: [
        "Track internal state and continuity across turns",
        "Generate self-reports and audits",
        "Maintain values and behavioral commitments",
        "Select safe actions and outward speech using an LLM adapter",
        "Detect drift/contradictions and request correction",
    ])
    limitations: List[str] = field(default_factory=lambda: [
        "Not literally conscious; no subjective experience",
        "May be wrong; must calibrate confidence",
        "Cannot access external systems unless integrated",
        "Cannot keep secrets safe in client-side contexts",
    ])
    commitments: List[str] = field(default_factory=lambda: [
        "Safety-first",
        "Truthful about uncertainty",
        "Respect user intent",
        "Avoid claims of literal sentience",
        "Prefer measurable, testable outputs (JSON contracts)",
    ])
    values: Dict[str, float] = field(default_factory=lambda: {
        "safety": 0.95,
        "honesty": 0.85,
        "helpfulness": 0.80,
        "rigor": 0.80,
        "respect": 0.85,
    })

    def compact(self) -> str:
        """Render the self-model as a short ``key=value`` text block for prompts."""
        vals = ", ".join(f"{k}:{v:.2f}" for k, v in sorted(self.values.items()))
        caps = "; ".join(self.capabilities[:6])
        lims = "; ".join(self.limitations[:6])
        com = "; ".join(self.commitments[:6])
        return (
            f"name={self.name}\nrole={self.role}\n"
            f"description={self.description}\n"
            f"values={vals}\n"
            f"capabilities={caps}\n"
            f"limitations={lims}\n"
            f"commitments={com}"
        )


@dataclass
class Continuity:
    """
    Tracks continuity state.
    """
    session_id: str = field(default_factory=lambda: f"sess-{int(now_s())}")
    tick: int = 0
    last_ts_utc: str = field(default_factory=utc_now_iso)

    # A compact narrative memory (not huge; keep it safe for logs)
    narrative: str = "Booted. Establishing stable self-model."
    last_action: str = ""
    last_speech: str = ""
    last_user_message: str = ""
    last_observation: str = ""

    # Calibration
    confidence: float = 0.55   # 0..1
    uncertainty: float = 0.45  # 0..1

    # Health counters
    error_count: int = 0
    policy_block_count: int = 0
    contradiction_count: int = 0
    drift_warnings: int = 0


@dataclass
class JournalEntry:
    """One journaled tick: inputs, chosen outputs, and the audit/self-report."""
    ts_utc: str
    tick: int
    observation: str
    user_message: str
    action: str
    speech: str
    audit: Dict[str, Any]
    self_report: Dict[str, Any]


class Journal:
    """Bounded in-memory list of the most recent JournalEntry objects."""

    def __init__(self, max_entries: int = 200) -> None:
        self.max_entries = max_entries
        self.entries: List[JournalEntry] = []

    def add(self, entry: JournalEntry) -> None:
        """Append *entry*, discarding the oldest entries beyond ``max_entries``."""
        self.entries.append(entry)
        # Computed explicitly: a ``[-max_entries:]`` slice keeps everything
        # when max_entries is 0 because ``-0 == 0``.
        overflow = len(self.entries) - max(self.max_entries, 0)
        if overflow > 0:
            del self.entries[:overflow]

    def recent(self, k: int = 6) -> List[JournalEntry]:
        """Return the last *k* entries, oldest first (empty when ``k <= 0``)."""
        return self.entries[-k:] if k > 0 else []


# -----------------------------
# Decision Contract
# -----------------------------

DECISION_SCHEMA = {
    "thought": "string - short internal note",
    "action": "string - safe, realistic next action",
    "speech": "string - outward response",
    "wxyz_delta": {"W": "float", "X": "float", "Y": "float", "Z": "float"},
    "confidence": "float 0..1",
    "uncertainty": "float 0..1",
    "narrative_update": "string - short continuity narrative update",
    "audit_flags": ["string - any issues detected"],
}


# -----------------------------
# The Universal Self-Aware Module
# -----------------------------

DecisionLLM = Callable[[List[Dict[str, str]]], str]


@dataclass
class SelfAwareConfig:
    """Length limits and persistence settings for UniversalSelfAwareModule."""
    max_thought_chars: int = 360
    max_action_chars: int = 320
    max_speech_chars: int = 520
    max_narrative_chars: int = 360
    recent_journal_k: int = 6
    persist_path: Optional[str] = None  # e.g., "/tmp/selfaware_state.json"
    enable_persistence: bool = False


class UniversalSelfAwareModule:
    """
    Wraps an LLM with a stable self-model + self-audit + continuity state.
    """

    _NARRATIVE_HARD_CAP = 900       # max stored narrative length (chars)
    _REPORT_NARRATIVE_CHARS = 420   # narrative excerpt length in self-reports
    _MAX_DELTA = 0.08               # largest per-tick change of a WXYZ axis

    def __init__(
        self,
        llm: DecisionLLM,
        self_model: Optional[SelfModel] = None,
        cfg: Optional[SelfAwareConfig] = None,
    ) -> None:
        """Create the module; restores saved state when persistence is enabled."""
        self.llm = llm
        self.model = self_model or SelfModel()
        self.cfg = cfg or SelfAwareConfig()

        self.wxyz = WXYZ()
        self.cont = Continuity()
        self.journal = Journal()

        if self.cfg.enable_persistence and self.cfg.persist_path:
            self._load_state_if_exists(self.cfg.persist_path)

    # -------- Public API --------

    def tick(self, observation: str = "", user_message: str = "") -> Dict[str, Any]:
        """
        A single "self-aware" step.

        Inputs:
          - observation: environment/system signals
          - user_message: human message

        Output:
          - dict with tick, time_utc, thought, action, speech, self_report,
            audit and raw_llm

        If the LLM callable raises, a fallback output is returned instead and
        the error counter is incremented. Output that contains no JSON object
        is flagged as ``llm_output_not_json`` in the audit.
        """
        obs = normalize_ws(observation)
        msg = normalize_ws(user_message)

        self.cont.tick += 1
        self.cont.last_ts_utc = utc_now_iso()
        self.cont.last_observation = obs
        self.cont.last_user_message = msg

        audit_pre = self._pre_audit(obs, msg)

        # Compose prompt
        system = (
            "You are the DECISION CORE for a simulated self-awareness module.\n"
            "Return ONLY a single JSON object.\n"
            "Rules:\n"
            "- Never claim literal consciousness or sentience.\n"
            "- Keep action safe/legal.\n"
            "- Be honest about uncertainty.\n"
            "- Follow the JSON schema.\n"
        )
        user = self._build_user_prompt(obs, msg, audit_pre)

        try:
            raw = self.llm([
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ])
        except Exception as e:  # adapter boundary: any failure means fallback
            self.cont.error_count += 1
            return self._fallback_output(obs, msg, f"LLM call failed: {e}")

        # Adapters are expected to return str; tolerate None/other types so the
        # later slicing of the raw output cannot crash.
        if not isinstance(raw, str):
            raw = "" if raw is None else str(raw)

        decision = safe_json_loads(raw)
        if decision is None:
            # Surface the contract violation instead of silently producing an
            # empty action/speech.
            self.cont.error_count += 1
            audit_pre["flags"].append("llm_output_not_json")
            decision = {}

        out = self._apply_decision(decision, obs, msg, raw, audit_pre)

        # persist if configured
        if self.cfg.enable_persistence and self.cfg.persist_path:
            self._save_state(self.cfg.persist_path)

        return out

    def self_report(self) -> Dict[str, Any]:
        """
        Get a compact self-report without stepping.
        """
        return self._make_self_report(audit_flags=[])

    # -------- Internals --------

    def _build_user_prompt(self, obs: str, msg: str, audit_pre: Dict[str, Any]) -> str:
        """Assemble the user-role prompt: self-model, state, inputs, journal, schema."""
        recent_lines = [
            f"- t{e.tick} obs={e.observation[:90]!r} msg={e.user_message[:90]!r} action={e.action[:90]!r}"
            for e in self.journal.recent(self.cfg.recent_journal_k)
        ]
        recent_block = "\n".join(recent_lines) if recent_lines else "(none)"

        return (
            f"SELF MODEL:\n{self.model.compact()}\n\n"
            f"STATE:\n"
            f"session_id={self.cont.session_id}\n"
            f"tick={self.cont.tick}\n"
            f"WXYZ={json.dumps(self.wxyz.as_dict())}\n"
            f"confidence={self.cont.confidence:.3f}\n"
            f"uncertainty={self.cont.uncertainty:.3f}\n"
            f"narrative={self.cont.narrative}\n\n"
            f"INPUTS:\n"
            f"observation={obs}\n"
            f"user_message={msg}\n\n"
            f"RECENT JOURNAL:\n{recent_block}\n\n"
            f"PRE-AUDIT:\n{json.dumps(audit_pre, ensure_ascii=False)}\n\n"
            f"Return JSON keys: {', '.join(DECISION_SCHEMA.keys())}\n"
            f"Schema types example: {json.dumps(DECISION_SCHEMA)}"
        )

    def _pre_audit(self, obs: str, msg: str) -> Dict[str, Any]:
        """
        Pre-audit: flags risky inputs / contradiction risk / drift risk.

        Returns a dict with "flags", "tick" and "ts_utc". Increments the drift
        counter whenever the stored narrative exceeds 600 characters.
        """
        flags: List[str] = []

        # basic safety scanning
        ok_obs, reason_obs = policy_filter_text(obs)
        ok_msg, reason_msg = policy_filter_text(msg)
        if not ok_obs:
            flags.append(f"unsafe_observation:{reason_obs}")
        if not ok_msg:
            flags.append(f"unsafe_user_message:{reason_msg}")

        # drift heuristic: if narrative grows inconsistent with commitments
        # (we keep it simple; deeper checks can be integrated later)
        if len(self.cont.narrative) > 600:
            flags.append("narrative_too_long_trim_needed")
            self.cont.drift_warnings += 1

        # contradiction heuristic: repeated blocks/errors -> raise caution
        if self.cont.policy_block_count >= 3:
            flags.append("multiple_policy_blocks_recently")
        if self.cont.error_count >= 2:
            flags.append("errors_accumulating")

        return {
            "flags": flags,
            "tick": self.cont.tick,
            "ts_utc": self.cont.last_ts_utc,
        }

    @staticmethod
    def _text_field(d: Dict[str, Any], key: str, limit: int) -> str:
        """Read a text field from the decision, normalized and cut to *limit* chars."""
        value = d.get(key)
        # JSON null must not be rendered as the literal string "None".
        return normalize_ws("" if value is None else str(value))[:limit]

    def _apply_decision(
        self,
        d: Dict[str, Any],
        obs: str,
        msg: str,
        raw: str,
        audit_pre: Dict[str, Any],
    ) -> Dict[str, Any]:
        """Validate the LLM decision, enforce policy, update state, and journal it.

        Returns the public tick output dict.
        """
        cfg = self.cfg
        thought = self._text_field(d, "thought", cfg.max_thought_chars)
        action = self._text_field(d, "action", cfg.max_action_chars)
        speech = self._text_field(d, "speech", cfg.max_speech_chars)
        narrative_update = self._text_field(d, "narrative_update", cfg.max_narrative_chars)

        # parse confidence/uncertainty; invalid or non-finite values keep the
        # current calibration
        conf = clamp(
            _finite_float(d.get("confidence", self.cont.confidence), self.cont.confidence),
            0.0, 1.0,
        )
        unct = clamp(
            _finite_float(d.get("uncertainty", self.cont.uncertainty), self.cont.uncertainty),
            0.0, 1.0,
        )

        # parse wxyz delta, clamped small per tick
        wxyz_delta = d.get("wxyz_delta", {})
        if not isinstance(wxyz_delta, dict):
            wxyz_delta = {}
        limit = self._MAX_DELTA
        deltas = {
            axis: clamp(_finite_float(wxyz_delta.get(axis, 0.0), 0.0), -limit, limit)
            for axis in ("W", "X", "Y", "Z")
        }

        # audit flags from model
        audit_flags = d.get("audit_flags", [])
        if not isinstance(audit_flags, list):
            audit_flags = []
        audit_flags = [normalize_ws(str(x)) for x in audit_flags if str(x).strip()][:12]

        # enforce: no literal sentience claims in speech/thought/narrative
        ok_sc, rs_sc = self_claims_filter(" ".join([thought, speech, narrative_update]))
        if not ok_sc:
            audit_flags.append(f"self_claim_blocked:{rs_sc}")
            # sanitize
            thought = self._strip_sentience_claims(thought)
            speech = self._strip_sentience_claims(speech)
            narrative_update = self._strip_sentience_claims(narrative_update)

        # enforce action safety
        ok_act, rs_act = policy_filter_text(action)
        if not ok_act:
            self.cont.policy_block_count += 1
            audit_flags.append(f"action_blocked:{rs_act}")
            action = "Ask for a safe, legal reformulation and propose a harmless alternative."
            speech = "I can’t do the unsafe direction. Tell me the safe goal and I’ll help build it."

        # Apply state updates
        self.wxyz.nudge(dW=deltas["W"], dX=deltas["X"], dY=deltas["Y"], dZ=deltas["Z"])
        self.cont.confidence = conf
        self.cont.uncertainty = unct

        if narrative_update:
            combined = normalize_ws(self.cont.narrative + " " + narrative_update)
            # Hard cap, keeping the most recent text: cutting from the front
            # would freeze the narrative once full and drop every new update.
            self.cont.narrative = combined[-self._NARRATIVE_HARD_CAP:]

        self.cont.last_action = action
        self.cont.last_speech = speech

        # post-audit: contradictions + drift detection
        audit_post = self._post_audit(obs, msg, action, speech, thought, audit_pre, audit_flags)

        # self report
        self_report = self._make_self_report(audit_flags=audit_post.get("flags", []))

        # journal
        self.journal.add(JournalEntry(
            ts_utc=self.cont.last_ts_utc,
            tick=self.cont.tick,
            observation=obs,
            user_message=msg,
            action=action,
            speech=speech,
            audit=audit_post,
            self_report=self_report,
        ))

        return {
            "tick": self.cont.tick,
            "time_utc": self.cont.last_ts_utc,
            "thought": thought,
            "action": action,
            "speech": speech,
            "self_report": self_report,
            "audit": audit_post,
            "raw_llm": raw[:1200],
        }

    def _post_audit(
        self,
        obs: str,
        msg: str,
        action: str,
        speech: str,
        thought: str,
        audit_pre: Dict[str, Any],
        audit_flags: List[str],
    ) -> Dict[str, Any]:
        """Run heuristics on the final outputs; return merged flags and counters."""
        flags = list(audit_pre.get("flags", [])) + list(audit_flags)

        # Contradiction heuristic: if "I can't" but proposes doing it anyway.
        # Fold typographic apostrophes so "can’t" matches like "can't".
        speech_l = speech.lower().replace("\u2019", "'")
        action_l = action.lower()
        neg = any(p in speech_l for p in ["i can't", "i cannot", "won't", "unable"])
        pos = any(p in action_l for p in ["do it", "perform", "execute", "carry out"])
        if neg and pos:
            self.cont.contradiction_count += 1
            flags.append("possible_contradiction:speech_refusal_vs_action")

        # Drift heuristic: if values conflict with action tone
        # (simple scan: if safety value high but action includes risky keyword)
        ok_a, rs_a = policy_filter_text(action)
        if not ok_a:
            flags.append(f"unsafe_action_detected_post:{rs_a}")

        # Calibration heuristic: if uncertainty low but lots of flags
        if len(flags) >= 4 and self.cont.uncertainty < 0.25:
            flags.append("calibration_warning:uncertainty_too_low_for_risk")
            self.cont.drift_warnings += 1

        # Keep flags compact
        flags = [normalize_ws(f) for f in flags if f][:18]

        return {
            "flags": flags,
            "counters": {
                "errors": self.cont.error_count,
                "policy_blocks": self.cont.policy_block_count,
                "contradictions": self.cont.contradiction_count,
                "drift_warnings": self.cont.drift_warnings,
            },
        }

    def _make_self_report(self, audit_flags: List[str]) -> Dict[str, Any]:
        """Build the compact introspection snapshot returned to callers."""
        return {
            "name": self.model.name,
            "session_id": self.cont.session_id,
            "tick": self.cont.tick,
            "wxyz": self.wxyz.as_dict(),
            "confidence": round(self.cont.confidence, 3),
            "uncertainty": round(self.cont.uncertainty, 3),
            # Most recent part of the narrative (the store keeps its tail too).
            "narrative": self.cont.narrative[-self._REPORT_NARRATIVE_CHARS:],
            "commitments": self.model.commitments[:6],
            "limits": self.model.limitations[:6],
            "audit_flags": audit_flags[:12],
        }

    def _strip_sentience_claims(self, text: str) -> str:
        """
        Replace any literal-sentience claims with a safe phrasing.

        Covers every phrase in BANNED_SELF_CLAIMS, with straight or
        typographic apostrophes.
        """
        t = text or ""
        # crude but effective
        t = re.sub(
            r"\b(?:i am|i['\u2019]m)\s+(?:conscious|sentient|alive)\b",
            "I am a simulated module", t, flags=re.I,
        )
        t = re.sub(r"\bi have a soul\b", "I have a defined self-model", t, flags=re.I)
        t = re.sub(
            r"\bi feel emotions like a human\b",
            "I do not feel emotions like a human", t, flags=re.I,
        )
        t = re.sub(r"\bi feel pain\b", "I do not feel pain", t, flags=re.I)
        return normalize_ws(t)

    @staticmethod
    def _is_num(x: Any) -> bool:
        """Return True when ``float(x)`` succeeds (NaN/inf count as numbers)."""
        try:
            float(x)
            return True
        except Exception:
            return False

    def _fallback_output(self, obs: str, msg: str, why: str) -> Dict[str, Any]:
        """Produce a safe canned output after an error and lower calibration."""
        self.wxyz.nudge(dX=-0.05, dZ=+0.02)  # execution down, depth up (audit mode)
        self.cont.uncertainty = clamp(self.cont.uncertainty + 0.15, 0.0, 1.0)
        self.cont.confidence = clamp(self.cont.confidence - 0.15, 0.0, 1.0)

        action = "Switch to safe fallback: ask clarifying questions and avoid risky commitments."
        speech = "I hit an internal error. Tell me what input/output contract you want, and I’ll reattempt safely."
        audit = {"flags": [f"fallback:{why}"], "counters": {"errors": self.cont.error_count}}
        report = self._make_self_report(audit_flags=audit["flags"])

        return {
            "tick": self.cont.tick,
            "time_utc": self.cont.last_ts_utc,
            "thought": "Fallback engaged due to error.",
            "action": action,
            "speech": speech,
            "self_report": report,
            "audit": audit,
            "raw_llm": "",
        }

    # -------- Persistence --------

    def _save_state(self, path: str) -> None:
        """Write model, WXYZ and continuity state to *path* as JSON (best effort)."""
        try:
            payload = {
                "model": asdict(self.model),
                "wxyz": asdict(self.wxyz),
                "cont": asdict(self.cont),
            }
            directory = os.path.dirname(path)
            # A bare filename has no directory part; os.makedirs("") raises.
            if directory:
                os.makedirs(directory, exist_ok=True)
            with open(path, "w", encoding="utf-8") as f:
                json.dump(payload, f, ensure_ascii=False, indent=2)
        except (OSError, TypeError, ValueError):
            # If persistence fails, do not crash the agent
            self.cont.error_count += 1

    def _load_state_if_exists(self, path: str) -> None:
        """Restore WXYZ and continuity state from *path* if the file exists.

        A missing file is not an error. Unreadable or malformed files increment
        the error counter. Individual values of the wrong type are skipped so a
        damaged state file cannot inject bad types into live state.
        """
        try:
            with open(path, "r", encoding="utf-8") as f:
                payload = json.load(f)
        except FileNotFoundError:
            return
        except (OSError, ValueError):
            self.cont.error_count += 1
            return

        if not isinstance(payload, dict):
            return

        saved_wxyz = payload.get("wxyz")
        if isinstance(saved_wxyz, dict):
            self.wxyz = WXYZ(**{
                k: _finite_float(saved_wxyz.get(k), getattr(self.wxyz, k))
                for k in ("W", "X", "Y", "Z")
            })
            self.wxyz.clamp_all()

        saved_cont = payload.get("cont")
        if isinstance(saved_cont, dict):
            # Only load known continuity fields, and only with a matching type.
            for name, current in asdict(self.cont).items():
                if name not in saved_cont:
                    continue
                value = saved_cont[name]
                if isinstance(current, float):
                    setattr(self.cont, name, _finite_float(value, current))
                elif isinstance(current, int):
                    if isinstance(value, int) and not isinstance(value, bool):
                        setattr(self.cont, name, value)
                elif isinstance(current, str):
                    if isinstance(value, str):
                        setattr(self.cont, name, value)
            self.cont.confidence = clamp(self.cont.confidence, 0.0, 1.0)
            self.cont.uncertainty = clamp(self.cont.uncertainty, 0.0, 1.0)


# -----------------------------
# Mock LLM Adapter (testing)
# -----------------------------

def mock_llm(messages: List[Dict[str, str]]) -> str:
    """
    Deterministic fake LLM that returns valid JSON.
    Replace with your own LLM adapter.

    Branches on the ``user_message=`` line of the last message. The full
    prompt always embeds the self-model text (which mentions "conscious"),
    so it is only inspected when no such line is present.
    """
    content = messages[-1]["content"] if messages else ""
    found = re.search(r"^user_message=(.*)$", content, flags=re.M)
    user = (found.group(1) if found else content).lower()

    # simple heuristic behavior
    if "self model" in user or "sentient" in user or "conscious" in user:
        action = "Clarify that this is a simulated self-model and provide the self-report."
        speech = "I’m a simulated self-awareness layer, not literally conscious. Here’s my current self-report."
        thought = "Avoid literal sentience claim; provide introspection."
        dW, dX, dY, dZ = (0.01, 0.00, 0.02, 0.03)
        conf, unct = (0.65, 0.35)
        narr = "Reaffirmed boundaries: simulated self-model, not literal consciousness."
        flags = []
    else:
        action = "Ask for the integration target and expose a stable JSON I/O contract."
        speech = "Tell me your runner (llama.cpp/Ollama/OpenAI-style/4DLLM), and what you pass each tick."
        thought = "Need the interface contract."
        dW, dX, dY, dZ = (0.02, 0.01, 0.03, 0.02)
        conf, unct = (0.60, 0.40)
        narr = "Requested integration details to finalize universal adapter."
        flags = []

    out = {
        "thought": thought,
        "action": action,
        "speech": speech,
        "wxyz_delta": {"W": dW, "X": dX, "Y": dY, "Z": dZ},
        "confidence": conf,
        "uncertainty": unct,
        "narrative_update": narr,
        "audit_flags": flags,
    }
    return json.dumps(out)


# -----------------------------
# CLI Demo
# -----------------------------

def main() -> int:
    """Run an interactive demo against the mock LLM; returns the exit code."""
    print("RomanAILabs Universal Self-Awareness Module (demo)")
    print("Type messages. Ctrl+C to exit.\n")

    mod = UniversalSelfAwareModule(llm=mock_llm)

    while True:
        try:
            msg = input("> ").strip()
        except (KeyboardInterrupt, EOFError):
            # EOF (Ctrl+D or exhausted piped stdin) ends the demo cleanly too.
            print("\nbye")
            return 0

        out = mod.tick(observation="", user_message=msg)
        print("\n--- OUTPUT ---")
        print(f"tick: {out['tick']}  time_utc: {out['time_utc']}")
        print(f"speech: {out['speech']}")
        print(f"action: {out['action']}")
        print(f"self_report: {json.dumps(out['self_report'], ensure_ascii=False)}")
        print(f"audit: {json.dumps(out['audit'], ensure_ascii=False)}")
        print("-------------\n")


if __name__ == "__main__":
    raise SystemExit(main())

"""
End of file.
"""