complete-computing-environment/wobserver-observability.org

5.0 KiB
Raw Permalink Blame History

Wobserver Observability

It is critical that the Wobserver can be "self-healing" — not that all faults can be automatically fixed, but that, with a human in the loop, all common failure modes can be managed and normal operation can be maintained.

Most of the problems are disk usage and IO utilization, failed disks and the like.

INPROGRESS observability

  • State "INPROGRESS" from [2022-01-01 Sat 16:43]

I keep wanting to set up Telegraf, but for now I'll use Prometheus, since it makes it easy to set up Prometheus exporters, as Nextcloud on Wobserver shows, for example.

I need to set up alerts and dashboards for the most common operations, and I'd love some declarative way to share these. Oh, some day.

{ pkgs, config, ... }:

with pkgs.lib;

let
  # mkStaticScrape :: string -> attrset -> attrset
  #
  # Builds one Prometheus scrape job called `name` with a single static
  # target, "<address>:<port>", taken from an exporter configuration `cfg`.
  #
  #   name  job_name to use for the scrape job
  #   cfg   attrset with a `port`, and optionally `listenAddr` or
  #         `listenAddress`
  #
  # The previous version only looked for `listenAddr`. NixOS exporter
  # configurations name that option `listenAddress`, so the lookup never
  # matched and every target silently fell back to "localhost".
  mkStaticScrape = name: cfg:
    let
      # `listenAddr` is still honoured first for hand-written attrsets that
      # relied on the old spelling.
      configured = cfg.listenAddr or (cfg.listenAddress or "localhost");

      # A wildcard bind address means "every interface"; it is not a useful
      # address to connect to, so scrape the exporter over loopback instead.
      wildcardAddresses = [ "" "0.0.0.0" "::" "[::]" ];

      addr =
        if builtins.elem configured wildcardAddresses
        then "localhost"
        else configured;
    in
    {
      job_name = name;
      static_configs = [
        { targets = [ "${addr}:${toString cfg.port}" ]; }
      ];
    };
in
rec {
  # Enable atop on the host.
  programs.atop.enable = true;

  # Prometheus server. It listens on loopback only and is told that its
  # external URL lives under /prom on home.rix.si.
  services.prometheus = {
    enable = true;
    listenAddress = "127.0.0.1";
    webExternalUrl = "https://home.rix.si/prom";
    retentionTime = "60d";

    scrapeConfigs =
      let
        exporters = config.services.prometheus.exporters;

        # One static scrape job per NixOS-managed exporter, in this order.
        exporterJobs = map
          (exporterName: mkStaticScrape exporterName exporters.${exporterName})
          [ "node" "process" "smartctl" "postgres" "zfs" ];

        # OctoPrint's Prometheus exporter plugin.
        # NOTE(review): this API key is stored in plain text in this file and
        # is copied verbatim into the scrape configuration. Consider rotating
        # it and keeping it out of version control.
        octopiJob = {
          job_name = "octopi";
          metrics_path = "/plugin/prometheus_exporter/metrics";
          params = { apikey = [ "27816EF9BA5C43749A022573B0862C71" ]; };
          static_configs = [ { targets = [ "octopi:80" ]; } ];
        };

        arcologyJob = {
          job_name = "arcology";
          static_configs = [ { targets = [ "localhost:8000" ]; } ];
        };

        synapseJob = {
          job_name = "synapse";
          metrics_path = "/_synapse/metrics";
          static_configs = [ { targets = [ "localhost:8008" ]; } ];
        };
      in
      # TODO: gitea
      exporterJobs ++ [ octopiJob arcologyJob synapseJob ];
  };

  # services.prometheus.exporters.pihole = {};
  # services.prometheus.exporters.nginx = {};
  # services.prometheus.exporters.nginxlog = {};
  # services.prometheus.exporters.unifi = {};

  # Node exporter, with the systemd collector enabled.
  services.prometheus.exporters.node.enable = true;
  services.prometheus.exporters.node.enabledCollectors = [ "systemd" ];

  # Process exporter. Each `process_names` entry pairs a `cmdline` regex with
  # a `name` template that is filled in from the regex's named capture groups.
  services.prometheus.exporters.process = {
    enable = true;
    settings.process_names = [
      # Remove nix store path from process name
      {
        name = "{{.Matches.Wrapped}} {{ .Matches.Args }}";
        cmdline = [ "^/nix/store[^ ]*/(?P<Wrapped>[^ /]*) (?P<Args>.*)" ];
      }
      # Command lines of the form "command: specialization".
      {
        name = "{{.Matches.Command}}: {{ .Matches.Specialization }}";
        # The hyphen is placed last in the character class so that it is a
        # literal without needing an escape. A Nix double-quoted string drops
        # the backslash from `\-`, so the earlier `9\-_` reached the exporter
        # as an unescaped hyphen in the middle of the class.
        cmdline = [ "(?P<Command>[a-zA-Z0-9_+-]+): (?P<Specialization>.*)" ];
      }
    ];
  };

  # Disk health: the smartctl exporter, the smartd service, and the
  # smartmontools package installed system-wide.
  services.prometheus.exporters.smartctl = { enable = true; };
  services.smartd = { enable = true; };
  environment.systemPackages = with pkgs; [ smartmontools ];

  # PostgreSQL exporter, run as the local database superuser.
  services.prometheus.exporters.postgres.enable = true;
  services.prometheus.exporters.postgres.runAsLocalSuperUser = true;

  # ZFS exporter.
  services.prometheus.exporters.zfs = { enable = true; };

  # Grafana, bound to loopback and published under the /grafana sub-path of
  # home.rix.si. Its state is kept in the "grafana" PostgreSQL database.
  services.grafana = {
    enable = true;
    dataDir = "/srv/grafana";

    settings = {
      # Grafana's option is spelled `reporting_enabled`. The previous
      # `reporting_enable` is not a Grafana setting, so usage reporting was
      # never actually turned off.
      analytics.reporting_enabled = false;
      database = {
        name = "grafana";
        type = "postgres";
        user = "grafana";
        # A filesystem path rather than host:port; presumably the PostgreSQL
        # unix-socket directory.
        host = "/run/postgresql/";
      };
      server = {
        enable_gzip = true;
        http_addr = "127.0.0.1";
        http_port = 3000;
        # Public URL; Grafana is served from the /grafana sub-path.
        root_url = "https://home.rix.si/grafana";
        serve_from_sub_path = true;
      };
    };
  };

  # Grafana image renderer, with Grafana provisioned to use it.
  services.grafana-image-renderer.enable = true;
  services.grafana-image-renderer.provisionGrafana = true;

  # Make sure the database and role that Grafana connects with exist.
  # NOTE(review): newer NixOS releases appear to replace `ensurePermissions`
  # with `ensureDBOwnership`; confirm against the nixpkgs version in use.
  services.postgresql = {
    ensureDatabases = [ "grafana" ];
    ensureUsers = [
      {
        name = "grafana";
        ensurePermissions."DATABASE grafana" = "ALL PRIVILEGES";
      }
    ];
  };

  # Reverse proxy for the two web UIs on home.rix.si. The upstream addresses
  # are read back from the Prometheus and Grafana settings in `config`.
  services.nginx.virtualHosts."home.rix.si" = {
    locations = {
      # Prometheus, protected by HTTP basic auth.
      "/prom" = {
        proxyPass =
          let prom = config.services.prometheus;
          in "http://${prom.listenAddress}:${toString prom.port}/prom";
        extraConfig = ''
          auth_basic           "closed site";
          auth_basic_user_file /srv/nginx-htpasswd;

        '';
      };

      # Grafana; the original Host header is passed through to the upstream.
      "/grafana" = {
        proxyPass =
          let server = config.services.grafana.settings.server;
          in "http://${server.http_addr}:${toString server.http_port}/grafana";
        extraConfig = ''
          proxy_set_header Host $host;
        '';
      };
    };
  };
}

NEXT Alerting

NEXT ZFS

NEXT Disk

NEXT matrix