complete-computing-environment/wobserver-observability.org

182 lines
5.0 KiB
Org Mode

:PROPERTIES:
:ID: 20220101T190353.843667
:ROAM_ALIASES: Wobservability
:END:
#+title: Wobserver Observability
#+filetags: :Wobserver:Development:
#+ARCOLOGY_ALLOW_CRAWL: t
#+ARCOLOGY_KEY: cce/wobserver/observability
#+AUTO_TANGLE: t
#+ARROYO_NIXOS_MODULE: nixos/wobservability.nix
#+ARROYO_SYSTEM_ROLE: server
It is critical that the [[id:20211120T220054.226284][Wobserver]] can be "self-healing" -- not that all faults can be automatically fixed, but that with a human in the loop all common failure modes can be managed and normal operation can be maintained.
Most of the problems are disk usage and IO utilization, failed disks and the like.
* INPROGRESS observability
:LOGBOOK:
- State "INPROGRESS" from [2022-01-01 Sat 16:43]
:END:
I keep wanting to set up telegraf, but for now I'll use =prometheus=, since it makes it easy to set up Prometheus exporters, as [[id:20220101T180015.306163][Nextcloud on Wobserver]] shows, for example.
I need to set up alerts and dashboards for the most common operations, and I'd love some declarative way to share these. Oh, some day.
#+begin_src nix :tangle ~/arroyo-nix/nixos/wobservability.nix
{ pkgs, config, ... }:
with pkgs.lib;
let
  # mkStaticScrape :: string -> attrset -> attrset
  #
  # Build a Prometheus scrape job called `jobName` with a single static
  # target derived from an exporter's configuration attrset.
  #
  #   jobName   value used for the job's `job_name`
  #   exporter  attrset providing `port`, and optionally `listenAddr`
  #
  # The target host is `exporter.listenAddr` when that attribute exists and
  # "localhost" otherwise.
  #
  # NOTE(review): the NixOS exporter options appear to name this setting
  # `listenAddress`, not `listenAddr`, in which case the fallback is always
  # taken -- confirm before relying on per-exporter addresses.
  mkStaticScrape = jobName: exporter:
    let
      host = exporter.listenAddr or "localhost";
    in {
      job_name = jobName;
      static_configs = [
        { targets = [ "${host}:${toString exporter.port}" ]; }
      ];
    };
in
rec {
# Enable atop on the host alongside the Prometheus stack.
programs.atop.enable = true;

# Prometheus server. It listens on loopback only and is told its external
# URL lives under /prom on home.rix.si.
# This must stay a literal attrset so that it merges with the separate
# `services.prometheus.exporters.*` definitions in this module.
services.prometheus = {
  enable = true;
  retentionTime = "60d";
  webExternalUrl = "https://home.rix.si/prom";
  listenAddress = "127.0.0.1";
  scrapeConfigs =
    let
      exporters = config.services.prometheus.exporters;

      # One static scrape job per exporter, named after the exporter and
      # pointed at that exporter's configured port.
      exporterJobs = map (name: mkStaticScrape name exporters.${name}) [
        "node"
        "process"
        "smartctl"
        "postgres"
        "zfs"
      ];

      # Hand-written jobs for services that expose their own metrics.
      serviceJobs = [
        {
          job_name = "octopi";
          metrics_path = "/plugin/prometheus_exporter/metrics";
          # NOTE(review): this API key is committed in plain text and is
          # copied verbatim into the generated scrape configuration --
          # consider rotating it and supplying it out of band.
          params.apikey = [ "27816EF9BA5C43749A022573B0862C71" ];
          static_configs = [ { targets = [ "octopi:80" ]; } ];
        }
        {
          job_name = "arcology";
          static_configs = [ { targets = [ "localhost:8000" ]; } ];
        }
        {
          job_name = "synapse";
          metrics_path = "/_synapse/metrics";
          static_configs = [ { targets = [ "localhost:8008" ]; } ];
        }
        # gitea
      ];
    in
      exporterJobs ++ serviceJobs;
};
# services.prometheus.exporters.pihole = {};
# services.prometheus.exporters.nginx = {};
# services.prometheus.exporters.nginxlog = {};
# services.prometheus.exporters.unifi = {};
# Exporters scraped by the Prometheus server configured in this module.
services.prometheus.exporters = {
  # Host metrics, with the systemd collector switched on explicitly.
  node = {
    enable = true;
    enabledCollectors = [ "systemd" ];
  };

  # Per-process metrics; the rules below control how processes are named.
  process = {
    enable = true;
    settings.process_names = [
      # Remove nix store path from process name: keep only the executable's
      # basename (Wrapped) followed by its arguments (Args).
      {
        name = "{{.Matches.Wrapped}} {{ .Matches.Args }}";
        cmdline = [ "^/nix/store[^ ]*/(?P<Wrapped>[^ /]*) (?P<Args>.*)" ];
      }
      # Command lines of the form "<command>: <rest>" are named from those
      # two captured parts.
      {
        name = "{{.Matches.Command}}: {{ .Matches.Specialization }}";
        cmdline = [ "(?P<Command>[a-zA-Z0-9\-_+]+): (?P<Specialization>.*)" ];
      }
    ];
  };

  smartctl.enable = true;

  postgres = {
    enable = true;
    runAsLocalSuperUser = true;
  };

  zfs.enable = true;
};

# Disk health: run smartd and install the smartmontools package system-wide.
services.smartd.enable = true;
environment.systemPackages = [ pkgs.smartmontools ];
# Grafana, bound to loopback and served under the /grafana sub-path of
# home.rix.si, storing its state in PostgreSQL over the local unix socket.
services.grafana = {
  enable = true;
  dataDir = "/srv/grafana";
  settings = {
    # Grafana's ini key is `reporting_enabled`. `settings` is free-form, so
    # a misspelled key is written out and ignored rather than rejected,
    # which would leave usage reporting switched on.
    analytics.reporting_enabled = false;
    database = {
      name = "grafana";
      type = "postgres";
      user = "grafana";
      # A directory path here selects the PostgreSQL unix socket, not TCP.
      host = "/run/postgresql/";
    };
    server = {
      enable_gzip = true;
      http_addr = "127.0.0.1";
      http_port = 3000;
      # root_url and serve_from_sub_path go together: Grafana itself
      # answers under /grafana, matching the proxied location.
      root_url = "https://home.rix.si/grafana";
      serve_from_sub_path = true;
    };
  };
};
# Grafana image renderer service, with `provisionGrafana` set so the module
# wires it into the Grafana instance.
services.grafana-image-renderer.enable = true;
services.grafana-image-renderer.provisionGrafana = true;
# Database and role backing Grafana (see `services.grafana.settings.database`).
services.postgresql = {
  ensureDatabases = [ "grafana" ];
  ensureUsers = [
    {
      name = "grafana";
      # NOTE(review): newer NixOS releases replaced `ensurePermissions`
      # with `ensureDBOwnership` -- confirm against the target release.
      ensurePermissions."DATABASE grafana" = "ALL PRIVILEGES";
    }
  ];
};
# Reverse-proxy Prometheus and Grafana under sub-paths of home.rix.si.
# Upstream addresses and ports are read back from the service configuration
# so they cannot drift from what the daemons actually listen on.
services.nginx.virtualHosts."home.rix.si".locations =
  let
    prom = config.services.prometheus;
    grafanaServer = config.services.grafana.settings.server;
  in {
    # Prometheus sits behind HTTP basic auth at the proxy.
    "/prom" = {
      proxyPass = "http://${prom.listenAddress}:${toString prom.port}/prom";
      extraConfig = ''
        auth_basic "closed site";
        auth_basic_user_file /srv/nginx-htpasswd;
      '';
    };

    # Grafana gets the original Host header forwarded; no proxy-level auth
    # is configured for this location.
    "/grafana" = {
      proxyPass = "http://${grafanaServer.http_addr}:${toString grafanaServer.http_port}/grafana";
      extraConfig = ''
        proxy_set_header Host $host;
      '';
    };
  };
}
#+end_src
** NEXT Alerting
** NEXT ZFS
** NEXT Disk
** NEXT matrix