182 lines
5.0 KiB
Org Mode
182 lines
5.0 KiB
Org Mode
:PROPERTIES:
|
|
:ID: 20220101T190353.843667
|
|
:ROAM_ALIASES: Wobservability
|
|
:END:
|
|
#+title: Wobserver Observability
|
|
#+filetags: :Wobserver:Development:
|
|
|
|
#+ARCOLOGY_ALLOW_CRAWL: t
|
|
#+ARCOLOGY_KEY: cce/wobserver/observability
|
|
|
|
#+AUTO_TANGLE: t
|
|
|
|
#+ARROYO_NIXOS_MODULE: nixos/wobservability.nix
|
|
#+ARROYO_SYSTEM_ROLE: server
|
|
|
|
It is critical that the [[id:20211120T220054.226284][Wobserver]] can be "self-healing" -- not that all faults can be automatically fixed, but that with a human in the loop all common failure modes can be managed and normal operation can be maintained.
|
|
|
|
Most of the problems involve disk usage, IO utilization, failed disks, and the like.
|
|
|
|
* INPROGRESS observability
|
|
:LOGBOOK:
|
|
- State "INPROGRESS" from [2022-01-01 Sat 16:43]
|
|
:END:
|
|
|
|
I keep wanting to set up telegraf, but for now I'll use =prometheus= since it makes it easy to set up Prometheus exporters, as [[id:20220101T180015.306163][Nextcloud on Wobserver]] shows.
|
|
|
|
I need to set up alerts and dashboards for the most common operations, and I'd love some declarative way to share these. Oh some day.
|
|
|
|
#+begin_src nix :tangle ~/arroyo-nix/nixos/wobservability.nix
|
|
{ pkgs, config, ... }:
|
|
|
|
with pkgs.lib;
|
|
|
|
let
  # mkStaticScrape name cfg
  #
  # Build a single Prometheus scrape job called `name` with one static target.
  # `cfg` is an attribute set that must provide `port`; the host part of the
  # target is `cfg.listenAddr` when that attribute exists, else "localhost".
  # NOTE(review): the exporter option sets this is called with may name the
  # option `listenAddress` rather than `listenAddr`, in which case the
  # "localhost" fallback is always taken -- confirm before relying on it.
  mkStaticScrape = name: cfg:
    let
      host = cfg.listenAddr or "localhost";
      endpoint = "${host}:${toString cfg.port}";
    in {
      job_name = name;
      static_configs = [ { targets = [ endpoint ]; } ];
    };
|
|
in
|
|
rec {
|
|
# Turn on the `programs.atop` option set.
programs.atop.enable = true;
|
|
# Prometheus server: bound to 127.0.0.1, advertised externally as
# https://home.rix.si/prom, keeping 60 days of samples.
# This must remain a literal attribute set so that it merges with the
# `services.prometheus.exporters.*` definitions made elsewhere in this module.
services.prometheus = {
  enable = true;
  listenAddress = "127.0.0.1";
  webExternalUrl = "https://home.rix.si/prom";
  retentionTime = "60d";

  scrapeConfigs =
    # One static job per exporter, built from that exporter's option set.
    map (exporter: mkStaticScrape exporter config.services.prometheus.exporters.${exporter}) [
      "node"
      "process"
      "smartctl"
      "postgres"
      "zfs"
    ]
    # Hand-written jobs for services that expose their own metrics endpoint.
    ++ [
      {
        job_name = "octopi";
        metrics_path = "/plugin/prometheus_exporter/metrics";
        # NOTE(review): this API key is committed in plain text and will
        # presumably end up in the generated Prometheus configuration --
        # consider rotating it and supplying it from a secret instead.
        params = { apikey = [ "27816EF9BA5C43749A022573B0862C71" ]; };
        static_configs = [ { targets = [ "octopi:80" ]; } ];
      }
      {
        job_name = "arcology";
        static_configs = [ { targets = [ "localhost:8000" ]; } ];
      }
      {
        job_name = "synapse";
        metrics_path = "/_synapse/metrics";
        static_configs = [ { targets = [ "localhost:8008" ]; } ];
      }
      # gitea
    ];
};
|
|
|
|
# services.prometheus.exporters.pihole = {};
|
|
# services.prometheus.exporters.nginx = {};
|
|
# services.prometheus.exporters.nginxlog = {};
|
|
# services.prometheus.exporters.unifi = {};
|
|
|
|
# Node exporter, with the "systemd" collector listed in enabledCollectors.
services.prometheus.exporters.node.enable = true;
services.prometheus.exporters.node.enabledCollectors = [ "systemd" ];
|
|
|
|
# Process exporter. `settings.process_names` is a list of matchers; each one
# pairs a `name` template with the `cmdline` regexes whose named capture
# groups the template refers to via `.Matches.<Group>`.
services.prometheus.exporters.process = {
  enable = true;
  settings.process_names = [
    # Remove nix store path from process name: capture the executable's base
    # name (Wrapped) and the remaining arguments (Args).
    {
      name = "{{.Matches.Wrapped}} {{ .Matches.Args }}";
      cmdline = [ "^/nix/store[^ ]*/(?P<Wrapped>[^ /]*) (?P<Args>.*)" ];
    }
    # Command lines of the form "<command>: <specialization>".
    # The hyphen sits at the end of the character class so it is a literal
    # without needing a backslash: in a Nix "..." string `\-` evaluates to a
    # bare `-`, so an escape written here never reaches the regex engine.
    {
      name = "{{.Matches.Command}}: {{ .Matches.Specialization }}";
      cmdline = [ "(?P<Command>[a-zA-Z0-9_+-]+): (?P<Specialization>.*)" ];
    }
  ];
};
|
|
|
|
# SMART monitoring: the smartd service, the smartctl Prometheus exporter, and
# the smartmontools package in the system package list.
services.smartd.enable = true;
services.prometheus.exporters.smartctl = { enable = true; };
environment.systemPackages = with pkgs; [ smartmontools ];
|
|
|
|
# PostgreSQL exporter, with runAsLocalSuperUser switched on.
services.prometheus.exporters.postgres.enable = true;
services.prometheus.exporters.postgres.runAsLocalSuperUser = true;

# ZFS exporter.
services.prometheus.exporters.zfs = { enable = true; };
|
|
|
|
# Grafana: state under /srv/grafana, database in PostgreSQL reached through
# the /run/postgresql/ socket directory, HTTP on 127.0.0.1:3000 and served
# from the /grafana sub-path of https://home.rix.si.
services.grafana = {
  enable = true;
  dataDir = "/srv/grafana";

  settings = {
    # Grafana's [analytics] key is `reporting_enabled`. The previous spelling
    # `reporting_enable` is not a key Grafana documents, so usage reporting
    # was presumably never actually turned off.
    analytics.reporting_enabled = false;

    database = {
      type = "postgres";
      host = "/run/postgresql/";
      name = "grafana";
      user = "grafana";
    };

    server = {
      http_addr = "127.0.0.1";
      http_port = 3000;
      root_url = "https://home.rix.si/grafana";
      # root_url carries a /grafana path component; serve from it directly.
      serve_from_sub_path = true;
      enable_gzip = true;
    };
  };
};
|
|
|
|
# Grafana image renderer, with provisionGrafana switched on.
services.grafana-image-renderer.enable = true;
services.grafana-image-renderer.provisionGrafana = true;
|
|
|
|
# Declare the "grafana" database and a "grafana" role holding ALL PRIVILEGES
# on it.
# NOTE(review): `ensurePermissions` may be deprecated on newer nixpkgs
# releases -- confirm against the release this host tracks.
services.postgresql = {
  ensureDatabases = [ "grafana" ];
  ensureUsers = [
    {
      name = "grafana";
      ensurePermissions."DATABASE grafana" = "ALL PRIVILEGES";
    }
  ];
};
|
|
|
|
# Reverse-proxy locations on the home.rix.si virtual host. Upstream addresses
# are read back from the evaluated Prometheus and Grafana options.
services.nginx.virtualHosts."home.rix.si" =
  let
    prom = config.services.prometheus;
    grafanaServer = config.services.grafana.settings.server;
  in {
    # Prometheus UI under /prom, behind HTTP basic auth.
    locations."/prom" = {
      proxyPass = "http://${prom.listenAddress}:${toString prom.port}/prom";
      extraConfig = ''
        auth_basic "closed site";
        auth_basic_user_file /srv/nginx-htpasswd;

      '';
    };

    # Grafana under /grafana; the original Host header is passed upstream.
    locations."/grafana" = {
      proxyPass = "http://${grafanaServer.http_addr}:${toString grafanaServer.http_port}/grafana";
      extraConfig = ''
        proxy_set_header Host $host;
      '';
    };
  };
|
|
}
|
|
#+end_src
|
|
|
|
** NEXT Alerting
|
|
|
|
** NEXT ZFS
|
|
|
|
** NEXT Disk
|
|
|
|
** NEXT matrix
|