diff --git a/AGENTS.md b/AGENTS.md index 8d87aae..932a17f 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -41,9 +41,12 @@ internal/ drift.go — CheckDrift, handlePush, periodicCheck reconciler.go — TriggerSync, handleDeploymentSucceeded/Failed federation/ — ActivityPub / ForgeFed (DATA LAYER ONLY — Phase 3F stub) + observability/ — Prometheus metrics + health (fully built — Phase 3E) + metrics.go — metric definitions, HTTP middleware, NATS watcher + health.go — Check() returning HealthStatus (DB ping + NATS) models/ — XORM structs + 13 migration files config/ — ENV-driven config, fails fast on missing secrets - events/ — NATS EventBus interface + NATSBus + NoOpBus + events/ — NATS EventBus interface + NATSBus + NoOpBus (Healthy() bool) web/ — //go:embed target for the built React SPA frontend/ src/ @@ -57,7 +60,7 @@ frontend/ **Middleware chain — this order is fixed, do not reorder:** ``` -Logger → RealIP → Recoverer → CORS → CSRF → SessionAuth → AuditLog → Handler +Logger → RealIP → Recoverer → Metrics → CORS → CSRF → SessionAuth → AuditLog → Handler ``` --- @@ -74,7 +77,7 @@ Logger → RealIP → Recoverer → CORS → CSRF → SessionAuth → AuditLog | 3B | Unified operational timeline | **Complete** | | 3C | Workspaces + secret management (Global → Workspace → Repo → Env) | **Complete** | | 3D | GitOps controller + drift detection + auto-sync | **Complete** | -| 3E | Observability (Prometheus endpoint, health checks, sparklines) | **Next** | +| 3E | Observability (Prometheus `/metrics`, structured `/health`, repo health API) | **Complete** | | 3F | Federation handlers (ActivityPub inbox/outbox) | Planned | | 4 | AI diagnostics, signed artifacts, OCI registry, dep/secret scanning | Planned | @@ -200,6 +203,9 @@ make lint # go vet + ESLint | `internal/domain/ci/executor.go` | Docker job executor + log streaming | | `internal/domain/gitops/controller.go` | GitOps reconciliation controller | | `internal/domain/gitops/drift.go` | `CheckDrift`, drift detection logic | +| 
`internal/observability/metrics.go` | Prometheus metric defs, HTTP middleware, NATS watcher | +| `internal/observability/health.go` | `Check()` — DB ping + NATS liveness | +| `internal/api/handlers/observability.go` | `/health` + `/repos/.../health` handlers | | `internal/api/handlers/environment.go` | Environment + deployment CRUD | | `internal/api/handlers/gitops.go` | GitOps config + drift HTTP endpoints | | `internal/api/handlers/secret.go` | Scoped secret management | diff --git a/CHANGELOG.md b/CHANGELOG.md index 8865565..8abfcca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,7 +16,7 @@ Versions follow [Semantic Versioning](https://semver.org/spec/v2.0.0.html). - Environment cards: live health status via HTTP health check polling - Repo page: error rate and deployment frequency sparklines -### Planned — Phase 3F (Federation) +### Planned — Phase 3F (Federation, next) - ActivityPub inbox/outbox HTTP handlers - HTTP signature verification middleware - WebFinger `/.well-known/webfinger` endpoint @@ -33,6 +33,44 @@ Versions follow [Semantic Versioning](https://semver.org/spec/v2.0.0.html). --- +## [0.8.0] — 2026-05-12 + +Phase 3E complete. Prometheus metrics, structured health checks, and per-repo operational health are operational. 
+ +### Added — Prometheus Metrics (`internal/observability/`) +- `GET /metrics` — Prometheus text format endpoint (standard root-level path for k8s/Prometheus scraping) +- `GET /health` — upgraded from static `{"status":"ok"}` to a structured liveness response: + `{"status":"healthy","checks":{"database":"ok","nats":"ok"},"version":"0.8.0"}` + Returns HTTP 503 when any dependency is degraded +- `internal/observability/metrics.go` — metric definitions: + - `forgebucket_http_requests_total{method,path,status}` — counter + - `forgebucket_http_request_duration_seconds{method,path}` — histogram (Prometheus default buckets) + - `forgebucket_pipeline_runs_total{status}` — counter (succeeded/failed/cancelled), pre-initialized to 0 + - `forgebucket_deployments_total{status}` — counter (pending/success/failure/cancelled), pre-initialized to 0 + - `forgebucket_active_pipeline_runs` — gauge (in-flight runs) +- `internal/observability/health.go` — `Check(db, bus)` pings PostgreSQL and calls `bus.Healthy()` +- HTTP instrumentation middleware inserted after `Recoverer`, before `CORS` — records every request +- Path normalization prevents label cardinality explosion: `/api/v1/repos/alice/myrepo/runs/42` → + `/api/v1/repos/:owner/:repo/runs/:id` +- NATS metric watcher subscribes to `pipeline.>` and `deployment.>` and increments counters + +### Added — Per-Repo Operational Health (`GET /api/v1/repos/{owner}/{repo}/health`) +- Returns a JSON summary for the repo page operational header: + - `ciPassRate7d` — fraction of pipeline runs that succeeded in the last 7 days + - `totalRuns7d` — total run count in the last 7 days + - `latestRun` — most recent `PipelineRun` record + - `latestDeployments` — one entry per environment showing latest deploy (envName, status, sha, finishedAt) + - `openDriftCount` — GitOpsConfigs in `drifted` state + - `openPRCount` — open pull request count + +### Added — EventBus `Healthy() bool` +- Added to the `EventBus` interface; `NATSBus` returns `nc.IsConnected()`; 
`NoOpBus` returns `true` + +### Changed — Middleware chain +- `observability.Middleware()` added between `Recoverer` and `CORS` (applies to all requests including `/health` and `/metrics`) + +--- + ## [0.7.0] — 2026-05-12 Phase 3D complete. Git is now the source of truth for environment deployment state. @@ -274,7 +312,8 @@ Initial development milestone. Core Git hosting, collaboration, and frontend SPA --- -[Unreleased]: https://github.com/forgeo/forgebucket/compare/v0.7.0...HEAD +[Unreleased]: https://github.com/forgeo/forgebucket/compare/v0.8.0...HEAD +[0.8.0]: https://github.com/forgeo/forgebucket/compare/v0.7.0...v0.8.0 [0.7.0]: https://github.com/forgeo/forgebucket/compare/v0.6.0...v0.7.0 [0.6.0]: https://github.com/forgeo/forgebucket/compare/v0.5.0...v0.6.0 [0.5.0]: https://github.com/forgeo/forgebucket/compare/v0.4.0...v0.5.0 diff --git a/README.md b/README.md index 7b85b4c..a91e753 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ ForgeBucket is a self-hosted, federated developer operations platform. Where other Git platforms show you a list of files, ForgeBucket surfaces deployments, pipeline health, environment drift, and operational context directly alongside your code. Repositories are runtime systems. The dashboard is a command center. -**Status:** Active development. Phase 3D (GitOps controller + drift detection) complete. Phase 3E (observability) is next. +**Status:** Active development. Phase 3E (observability) complete. Phase 3F (federation handlers) is next. --- @@ -88,8 +88,12 @@ ForgeBucket is a self-hosted, federated developer operations platform. 
Where oth ### Observability + Security | Feature | Status | |---------|--------| -| Prometheus endpoint + health checks | Planned (Phase 3E) | -| Health sparklines in repo/env pages | Planned (Phase 3E) | +| `GET /health` — structured DB + NATS liveness check | Done | +| `GET /metrics` — Prometheus endpoint (HTTP + platform metrics) | Done | +| HTTP instrumentation middleware (latency histogram, request counter) | Done | +| Per-repo operational health summary (`GET /repos/.../health`) | Done | +| NATS-driven pipeline + deployment counters | Done | +| Health sparklines in repo/env pages (frontend) | Planned (Phase 4) | | Secret scanning | Planned (Phase 4) | | Dependency scanning | Planned (Phase 4) | | Signed artifacts (Sigstore/Cosign) | Planned (Phase 4) | @@ -138,6 +142,7 @@ ForgeBucket ├── Issue Service (issues — internal/api/handlers/) ├── CI Orchestrator (DAG execution, Docker runner — internal/domain/ci/) ├── GitOps Controller (drift detection, auto-sync — internal/domain/gitops/) +├── Observability (Prometheus metrics, health — internal/observability/) ├── Environment Service (environments, deployments — internal/api/handlers/environment.go) ├── Secret Manager (scoped AES-256-GCM — internal/api/handlers/secret.go) ├── Workspace Service (multi-tenant namespaces — internal/api/handlers/workspace.go) @@ -148,9 +153,9 @@ ForgeBucket └── Web Frontend (React 18 + TypeScript, //go:embed — web/) ``` -**Middleware chain (every authenticated request):** +**Middleware chain (every request):** ``` -Logger → RealIP → Recoverer → CORS → CSRF → SessionAuth → AuditLog → Handler +Logger → RealIP → Recoverer → Metrics → CORS → CSRF → SessionAuth → AuditLog → Handler ``` --- @@ -236,8 +241,8 @@ ForgeBucket has its own design language — intentionally distinct from GitHub a | Phase 3B | Unified operational timeline | Done | | Phase 3C | Workspaces + secret management hierarchy (Global → Workspace → Repo → Env) | Done | | Phase 3D | GitOps controller + drift detection + 
auto-sync | Done | -| Phase 3E | Observability (Prometheus endpoint, health checks, sparklines) | Next | -| Phase 3F | Federation handlers (ActivityPub inbox/outbox, cross-instance PRs) | Planned | +| Phase 3E | Observability (Prometheus `/metrics`, structured `/health`, repo health API) | Done | +| Phase 3F | Federation handlers (ActivityPub inbox/outbox, cross-instance PRs) | Next | | Phase 4 | AI diagnostics, signed artifacts, OCI registry, secret/dep scanning | Planned | --- diff --git a/cmd/forgebucket/main.go b/cmd/forgebucket/main.go index 9169728..6c0ab75 100644 --- a/cmd/forgebucket/main.go +++ b/cmd/forgebucket/main.go @@ -21,6 +21,7 @@ import ( gitdomain "github.com/forgeo/forgebucket/internal/domain/git" "github.com/forgeo/forgebucket/internal/domain/gitops" "github.com/forgeo/forgebucket/internal/events" + "github.com/forgeo/forgebucket/internal/observability" "github.com/forgeo/forgebucket/internal/models/migrations" "github.com/forgeo/forgebucket/web" ) @@ -77,6 +78,8 @@ func main() { gitopsCtrl := gitops.NewController(engine, bus, cfg) go gitopsCtrl.Start(ciCtx) + go observability.StartNATSWatcher(ciCtx, bus) + handler := api.New(cfg, engine, store, bus, cfg.ArtifactRoot, web.FS()) srv := &http.Server{ diff --git a/go.mod b/go.mod index ee0c757..fe39ebd 100644 --- a/go.mod +++ b/go.mod @@ -8,22 +8,31 @@ require ( github.com/gorilla/sessions v1.4.0 github.com/joho/godotenv v1.5.1 github.com/lib/pq v1.12.3 + github.com/nats-io/nats.go v1.52.0 + github.com/prometheus/client_golang v1.23.2 golang.org/x/crypto v0.50.0 + gopkg.in/yaml.v3 v3.0.1 nhooyr.io/websocket v1.8.17 xorm.io/xorm v1.3.11 ) require ( + github.com/beorn7/perks v1.0.1 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/goccy/go-json v0.10.5 // indirect github.com/golang/snappy v0.0.4 // indirect github.com/gorilla/securecookie v1.1.2 // indirect github.com/klauspost/compress v1.18.5 // indirect - github.com/nats-io/nats.go v1.52.0 // indirect + 
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/nats-io/nkeys v0.4.15 // indirect github.com/nats-io/nuid v1.0.1 // indirect + github.com/prometheus/client_model v0.6.2 // indirect + github.com/prometheus/common v0.66.1 // indirect + github.com/prometheus/procfs v0.16.1 // indirect github.com/syndtr/goleveldb v1.0.0 // indirect + go.yaml.in/yaml/v2 v2.4.2 // indirect golang.org/x/sys v0.43.0 // indirect golang.org/x/tools v0.43.0 // indirect - gopkg.in/yaml.v3 v3.0.1 // indirect + google.golang.org/protobuf v1.36.8 // indirect xorm.io/builder v0.3.13 // indirect ) diff --git a/go.sum b/go.sum index db70021..5093797 100644 --- a/go.sum +++ b/go.sum @@ -2,6 +2,10 @@ filippo.io/edwards25519 v1.1.0 h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA= filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4= gitea.com/xorm/sqlfiddle v0.0.0-20180821085327-62ce714f951a h1:lSA0F4e9A2NcQSqGqTOXqu2aRi/XEQxDCBwM8yJtE6s= gitea.com/xorm/sqlfiddle v0.0.0-20180821085327-62ce714f951a/go.mod h1:EXuID2Zs0pAQhH8yz+DNjUbjppKQzKFAn28TMYPB6IU= +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= +github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -20,6 +24,8 @@ github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5y github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM= 
github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= @@ -36,12 +42,20 @@ github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51 h1:Z9n2FFNU github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51/go.mod h1:CzGEWj7cYgsdH8dAjBGEr58BoE7ScuLd+fwFZ44+/x8= github.com/klauspost/compress v1.18.5 h1:/h1gH5Ce+VWNLSWqPzOVn6XBO+vJbCNGvjoaGBFW2IE= github.com/klauspost/compress v1.18.5/go.mod h1:cwPg85FWrGar70rWktvGQj8/hthj3wpl0PGDogxkrSQ= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= +github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= github.com/lib/pq v1.12.3 h1:tTWxr2YLKwIvK90ZXEw8GP7UFHtcbTtty8zsI+YjrfQ= github.com/lib/pq v1.12.3/go.mod h1:/p+8NSbOcwzAEI7wiMXFlgydTwcgTr3OSKMsD2BitpA= github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= github.com/mattn/go-sqlite3 v1.14.32 h1:JD12Ag3oLy1zQA+BNn74xRgaBbdhbNIDYvQUEuuErjs= github.com/mattn/go-sqlite3 v1.14.32/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 
h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/nats-io/nats.go v1.52.0 h1:n3avV4VBsCgsdwh71TppsTwtv+QdPs7ntSKM8qJLGsc= github.com/nats-io/nats.go v1.52.0/go.mod h1:26HypzazeOkyO3/mqd1zZd53STJN0EjCYF9Uy2ZOBno= github.com/nats-io/nkeys v0.4.15 h1:JACV5jRVO9V856KOapQ7x+EY8Jo3qw1vJt/9Jpwzkk4= @@ -57,14 +71,28 @@ github.com/onsi/gomega v1.4.3 h1:RE1xgDvH7imwFD45h+u2SgIfERHlS2yNG4DObb5BSKU= github.com/onsi/gomega v1.4.3/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= +github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= +github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= +github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= +github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs= +github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA= +github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg= +github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE= github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo= +github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= +github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= github.com/stretchr/objx v0.1.0/go.mod 
h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= github.com/syndtr/goleveldb v1.0.0 h1:fBdIW9lB4Iz0n9khmH8w27SJ3QEJ7+IgjPEwGSZiFdE= github.com/syndtr/goleveldb v1.0.0/go.mod h1:ZVVdQEZoIme9iO1Ch2Jdy24qqXrMMOU6lpPAyBWyWuQ= +go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= +go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= +go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI= +go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU= golang.org/x/crypto v0.50.0 h1:zO47/JPrL6vsNkINmLoo/PH1gcxpls50DNogFvB5ZGI= golang.org/x/crypto v0.50.0/go.mod h1:3muZ7vA7PBCE6xgPX7nkzzjiUq87kRItoJQM1Yo8S+Q= golang.org/x/mod v0.34.0 h1:xIHgNUUnW6sYkcM5Jleh05DvLOtwc6RitGHbDk4akRI= @@ -78,12 +106,18 @@ golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.43.0 h1:Rlag2XtaFTxp19wS8MXlJwTvoh8ArU6ezoyFsMyCTNI= golang.org/x/sys v0.43.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= +golang.org/x/term v0.42.0 h1:UiKe+zDFmJobeJ5ggPwOshJIVt6/Ft0rcfrXZDLWAWY= +golang.org/x/term v0.42.0/go.mod h1:Dq/D+snpsbazcBG5+F9Q1n2rXV8Ma+71xEjTRufARgY= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.36.0 h1:JfKh3XmcRPqZPKevfXVpI1wXPTqbkE5f7JA92a55Yxg= golang.org/x/text v0.36.0/go.mod h1:NIdBknypM8iqVmPiuco0Dh6P5Jcdk8lJL0CUebqK164= golang.org/x/tools v0.43.0 h1:12BdW9CeB3Z+J/I/wj34VMl8X+fEXBxVR90JeMX5E7s= golang.org/x/tools v0.43.0/go.mod h1:uHkMso649BX2cZK6+RpuIPXS3ho2hZo4FVwfoy1vIk0= +google.golang.org/protobuf v1.36.8 
h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc= +google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/fsnotify.v1 v1.4.7 h1:xOHLXZwVvI9hhs+cLKq5+I5onOuwQLhQwiu63xxlHs4= gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys= gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ= diff --git a/internal/api/handlers/observability.go b/internal/api/handlers/observability.go new file mode 100644 index 0000000..b9e2919 --- /dev/null +++ b/internal/api/handlers/observability.go @@ -0,0 +1,126 @@ +package handlers + +import ( + "net/http" + "time" + + "xorm.io/xorm" + + "github.com/forgeo/forgebucket/internal/events" + "github.com/forgeo/forgebucket/internal/models" + "github.com/forgeo/forgebucket/internal/observability" +) + +// ── /health ─────────────────────────────────────────────────────────────────── + +type HealthHandler struct { + db *xorm.Engine + bus events.EventBus +} + +func NewHealthHandler(db *xorm.Engine, bus events.EventBus) *HealthHandler { + return &HealthHandler{db: db, bus: bus} +} + +func (h *HealthHandler) Health(w http.ResponseWriter, r *http.Request) { + status := observability.Check(h.db, h.bus) + if status.Status != "healthy" { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusServiceUnavailable) + jsonOK(w, status) + return + } + jsonOK(w, status) +} + +// ── /api/v1/repos/{owner}/{repo}/health ────────────────────────────────────── + +type RepoHealthHandler struct{ db *xorm.Engine } + +func NewRepoHealthHandler(db *xorm.Engine) *RepoHealthHandler { + return &RepoHealthHandler{db: db} 
+} + +type latestDeployment struct { + EnvName string `json:"envName"` + Status string `json:"status"` + SHA string `json:"sha"` + FinishedAt *time.Time `json:"finishedAt"` +} + +type repoHealthResponse struct { + CIPassRate7d float64 `json:"ciPassRate7d"` + TotalRuns7d int `json:"totalRuns7d"` + LatestRun *models.PipelineRun `json:"latestRun"` + LatestDeployments []latestDeployment `json:"latestDeployments"` + OpenDriftCount int `json:"openDriftCount"` + OpenPRCount int `json:"openPRCount"` +} + +// Get returns an operational health summary for a repository. +// This feeds the repo page header: CI pass rate, latest deploy per env, drift count. +func (h *RepoHealthHandler) Get(w http.ResponseWriter, r *http.Request) { + repoID, ok := resolveRepoID(h.db, w, r) + if !ok { + return + } + + since7d := time.Now().UTC().Add(-7 * 24 * time.Hour) + + // CI pass rate over last 7 days. + var runs []models.PipelineRun + h.db.Where("repo_id = ? AND created_at >= ?", repoID, since7d).Find(&runs) + total := len(runs) + succeeded := 0 + for _, run := range runs { + if run.Status == "succeeded" { + succeeded++ + } + } + var passRate float64 + if total > 0 { + passRate = float64(succeeded) / float64(total) + } + + // Latest run overall. + var latestRun models.PipelineRun + var hasLatest bool + hasLatest, _ = h.db.Where("repo_id = ?", repoID).Desc("id").Limit(1).Get(&latestRun) + + // Latest deployment per environment. + var envs []models.Environment + h.db.Where("repo_id = ?", repoID).Find(&envs) + deploys := make([]latestDeployment, 0, len(envs)) + for _, env := range envs { + var d models.Deployment + if found, _ := h.db.Where("env_id = ?", env.ID).Desc("id").Limit(1).Get(&d); found { + deploys = append(deploys, latestDeployment{ + EnvName: env.Name, + Status: string(d.Status), + SHA: d.SHA, + FinishedAt: d.FinishedAt, + }) + } + } + + // Open drift count (GitOpsConfigs where sync_status = 'drifted'). + driftCount, _ := h.db.Where("repo_id = ? 
AND sync_status = 'drifted'", repoID). + Count(&models.GitOpsConfig{}) + + // Open PR count. + prCount, _ := h.db.Where("repo_id = ? AND status = 'open'", repoID). + Count(&models.PullRequest{}) + + resp := repoHealthResponse{ + CIPassRate7d: passRate, + TotalRuns7d: total, + LatestDeployments: deploys, + OpenDriftCount: int(driftCount), + OpenPRCount: int(prCount), + } + if hasLatest { + resp.LatestRun = &latestRun + } + + jsonOK(w, resp) +} diff --git a/internal/api/router.go b/internal/api/router.go index c582754..5ecc8b4 100644 --- a/internal/api/router.go +++ b/internal/api/router.go @@ -14,10 +14,13 @@ import ( "github.com/gorilla/sessions" "xorm.io/xorm" + "github.com/prometheus/client_golang/prometheus/promhttp" + "github.com/forgeo/forgebucket/internal/api/handlers" "github.com/forgeo/forgebucket/internal/api/middleware" "github.com/forgeo/forgebucket/internal/config" "github.com/forgeo/forgebucket/internal/events" + "github.com/forgeo/forgebucket/internal/observability" ) func New(cfg *config.Config, engine *xorm.Engine, store sessions.Store, bus events.EventBus, artifactRoot string, staticFiles fs.FS) http.Handler { @@ -26,6 +29,7 @@ func New(cfg *config.Config, engine *xorm.Engine, store sessions.Store, bus even r.Use(chimiddleware.Logger) r.Use(chimiddleware.RealIP) r.Use(chimiddleware.Recoverer) + r.Use(observability.Middleware()) r.Use(cors.Handler(cors.Options{ AllowedOrigins: []string{"http://localhost:5173", cfg.InstanceURL}, AllowedMethods: []string{"GET", "POST", "PUT", "PATCH", "DELETE", "OPTIONS"}, @@ -53,9 +57,11 @@ func New(cfg *config.Config, engine *xorm.Engine, store sessions.Store, bus even webhookH := handlers.NewWebhookHandler(engine) prSettingsH := handlers.NewPRSettingsHandler(engine) lfsH := handlers.NewLFSHandler(engine) - exploreH := handlers.NewExploreHandler(engine) - dashH := handlers.NewDashboardHandler(engine) - auditH := handlers.NewAuditHandler(engine) + exploreH := handlers.NewExploreHandler(engine) + dashH := 
handlers.NewDashboardHandler(engine) + auditH := handlers.NewAuditHandler(engine) + healthH := handlers.NewHealthHandler(engine, bus) + repoHealthH := handlers.NewRepoHealthHandler(engine) artifactH := handlers.NewArtifactHandler(engine, artifactRoot) runnerH := handlers.NewRunnerHandler(engine) gitopsH := handlers.NewGitOpsHandler(engine, bus) @@ -74,17 +80,16 @@ func New(cfg *config.Config, engine *xorm.Engine, store sessions.Store, bus even r.Post("/git-receive-pack", gitH.ServeGit) }) + // ── Ops endpoints (root-level, no auth, standard paths for k8s/Prometheus) ── + r.Get("/health", healthH.Health) + r.Get("/metrics", promhttp.Handler().ServeHTTP) + r.Route("/api/v1", func(r chi.Router) { // ── Public ──────────────────────────────────────────────────────────── r.Get("/explore/repos", exploreH.Repos) r.Get("/explore/users", exploreH.Users) - r.Get("/health", func(w http.ResponseWriter, r *http.Request) { - w.Header().Set("Content-Type", "application/json") - w.Write([]byte(`{"status":"ok"}`)) - }) - // Generates a CSRF token + cookie. SPA calls this once on load. 
r.Get("/csrf", func(w http.ResponseWriter, r *http.Request) { token, err := middleware.NewCSRFToken(w, !cfg.Debug) @@ -240,6 +245,7 @@ func New(cfg *config.Config, engine *xorm.Engine, store sessions.Store, bus even r.With(csrf).Delete("/secrets/{name}", secretH.DeleteRepoSecret) r.Get("/lfs-settings", lfsH.Get) r.With(csrf).Put("/lfs-settings", lfsH.Update) + r.Get("/health", repoHealthH.Get) r.Route("/environments", func(r chi.Router) { r.Get("/", envH.ListEnvironments) r.With(csrf).Post("/", envH.CreateEnvironment) diff --git a/internal/events/bus.go b/internal/events/bus.go index 6e94f98..ee39a96 100644 --- a/internal/events/bus.go +++ b/internal/events/bus.go @@ -16,6 +16,7 @@ import ( type EventBus interface { Publish(subject string, payload any) error Subscribe(subject string, handler func(subject string, data []byte)) (func(), error) + Healthy() bool Close() } @@ -63,6 +64,8 @@ func (b *NATSBus) Subscribe(subject string, handler func(subject string, data [] return func() { sub.Unsubscribe() }, nil //nolint:errcheck } +func (b *NATSBus) Healthy() bool { return b.nc.IsConnected() } + func (b *NATSBus) Close() { if err := b.nc.Drain(); err != nil { log.Printf("nats: drain: %v", err) @@ -75,6 +78,7 @@ type NoOpBus struct{} func (NoOpBus) Publish(_ string, _ any) error { return nil } func (NoOpBus) Subscribe(_ string, _ func(string, []byte)) (func(), error) { return func() {}, nil } +func (NoOpBus) Healthy() bool { return true } func (NoOpBus) Close() {} // New returns a NATSBus if url is non-empty, otherwise a NoOpBus. diff --git a/internal/observability/health.go b/internal/observability/health.go new file mode 100644 index 0000000..835b366 --- /dev/null +++ b/internal/observability/health.go @@ -0,0 +1,52 @@ +package observability + +import ( + "fmt" + + "xorm.io/xorm" + + "github.com/forgeo/forgebucket/internal/events" +) + +const Version = "0.8.0" + +// HealthStatus is the response shape for GET /health. 
+type HealthStatus struct { + Status string `json:"status"` // "healthy" | "degraded" + Checks map[string]string `json:"checks"` // dependency name → "ok" | error message + Version string `json:"version"` +} + +// Check pings each critical dependency and returns a HealthStatus. +// HTTP status should be 200 when Status=="healthy", 503 when "degraded". +func Check(db *xorm.Engine, bus events.EventBus) HealthStatus { + checks := make(map[string]string, 2) + + // Database — attempt a lightweight ping. + if err := db.Ping(); err != nil { + checks["database"] = fmt.Sprintf("error: %v", err) + } else { + checks["database"] = "ok" + } + + // NATS — use the Healthy() method added in Phase 3E. + if bus.Healthy() { + checks["nats"] = "ok" + } else { + checks["nats"] = "disconnected" + } + + overall := "healthy" + for _, v := range checks { + if v != "ok" { + overall = "degraded" + break + } + } + + return HealthStatus{ + Status: overall, + Checks: checks, + Version: Version, + } +} diff --git a/internal/observability/metrics.go b/internal/observability/metrics.go new file mode 100644 index 0000000..0914da3 --- /dev/null +++ b/internal/observability/metrics.go @@ -0,0 +1,172 @@ +package observability + +import ( + "context" + "encoding/json" + "log" + "net/http" + "regexp" + "strconv" + "time" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" + + "github.com/forgeo/forgebucket/internal/events" +) + +// ── Metric definitions ──────────────────────────────────────────────────────── + +var ( + HttpRequestsTotal = promauto.NewCounterVec(prometheus.CounterOpts{ + Name: "forgebucket_http_requests_total", + Help: "Total HTTP requests by method, normalized path, and status code.", + }, []string{"method", "path", "status"}) + + HttpRequestDuration = promauto.NewHistogramVec(prometheus.HistogramOpts{ + Name: "forgebucket_http_request_duration_seconds", + Help: "HTTP request latency by method and normalized path.", + 
Buckets: prometheus.DefBuckets, + }, []string{"method", "path"}) + + PipelineRunsTotal = promauto.NewCounterVec(prometheus.CounterOpts{ + Name: "forgebucket_pipeline_runs_total", + Help: "Pipeline runs by terminal status.", + }, []string{"status"}) + + DeploymentsTotal = promauto.NewCounterVec(prometheus.CounterOpts{ + Name: "forgebucket_deployments_total", + Help: "Deployments by terminal status.", + }, []string{"status"}) + + ActivePipelineRuns = promauto.NewGauge(prometheus.GaugeOpts{ + Name: "forgebucket_active_pipeline_runs", + Help: "Pipeline runs currently in queued or running state.", + }) +) + +func init() { + // Pre-initialize all label combinations so the metrics are visible in + // /metrics immediately from startup (no gaps on first scrape). + for _, s := range []string{"succeeded", "failed", "cancelled"} { + PipelineRunsTotal.With(prometheus.Labels{"status": s}) + } + for _, s := range []string{"pending", "success", "failure", "cancelled"} { + DeploymentsTotal.With(prometheus.Labels{"status": s}) + } +} + +// ── HTTP instrumentation middleware ────────────────────────────────────────── + +type statusRecorder struct { + http.ResponseWriter + status int +} + +func (r *statusRecorder) WriteHeader(code int) { + r.status = code + r.ResponseWriter.WriteHeader(code) +} + +// Middleware records request count and latency for every HTTP request. +// Path labels are normalized to prevent high cardinality (numeric segments +// and positional path variables are replaced with placeholder tokens). 
+func Middleware() func(http.Handler) http.Handler { + return func(next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + start := time.Now() + rec := &statusRecorder{ResponseWriter: w, status: http.StatusOK} + next.ServeHTTP(rec, r) + + path := normalizePath(r.URL.Path) + status := strconv.Itoa(rec.status) + elapsed := time.Since(start).Seconds() + + HttpRequestsTotal.WithLabelValues(r.Method, path, status).Inc() + HttpRequestDuration.WithLabelValues(r.Method, path).Observe(elapsed) + }) + } +} + +// normalizePath replaces volatile path segments with placeholders so that +// Prometheus label cardinality stays bounded. +// +// Examples: +// +// /api/v1/repos/alice/myrepo/runs/42/jobs/7/logs +// → /api/v1/repos/:owner/:repo/runs/:id/jobs/:id/logs +// +// /alice/myrepo.git/info/refs +// → /:owner/:repo.git/info/refs +var reNumeric = regexp.MustCompile(`/\d+`) + +func normalizePath(path string) string { + // Replace all-numeric segments first. + path = reNumeric.ReplaceAllString(path, "/:id") + + // Normalize repo smart-HTTP paths: /{owner}/{repo}.git/... + path = reGitPath.ReplaceAllString(path, "/:owner/:repo.git$1") + + // Normalize /api/v1/repos/{owner}/{repo}/... + path = reRepoPath.ReplaceAllString(path, "/api/v1/repos/:owner/:repo$1") + + // Normalize /api/v1/workspaces/{handle}/... + path = reWorkspacePath.ReplaceAllString(path, "/api/v1/workspaces/:handle$1") + + return path +} + +var ( + reGitPath = regexp.MustCompile(`^/[^/]+/[^/]+\.git(/.*)$`) + reRepoPath = regexp.MustCompile(`^/api/v1/repos/[^/]+/[^/]+(/.*)$`) + reWorkspacePath = regexp.MustCompile(`^/api/v1/workspaces/[^/]+(/.*)$`) +) + +// ── NATS event watcher ──────────────────────────────────────────────────────── + +// StartNATSWatcher subscribes to pipeline and deployment NATS events and +// increments the corresponding Prometheus counters. Runs until ctx is cancelled. 
+func StartNATSWatcher(ctx context.Context, bus events.EventBus) { + type statusPayload struct { + Status string `json:"status"` + } + + unsub1, err := bus.Subscribe("pipeline.>", func(subject string, data []byte) { + switch subject { + case events.SubjectPipelineTriggered: + ActivePipelineRuns.Inc() + case events.SubjectPipelineCompleted: + ActivePipelineRuns.Dec() + PipelineRunsTotal.WithLabelValues("succeeded").Inc() + case events.SubjectPipelineFailed: + ActivePipelineRuns.Dec() + PipelineRunsTotal.WithLabelValues("failed").Inc() + } + }) + if err != nil { + log.Printf("observability: subscribe pipeline.*: %v", err) + } else { + defer unsub1() + } + + unsub2, err := bus.Subscribe("deployment.>", func(subject string, data []byte) { + var p statusPayload + json.Unmarshal(data, &p) //nolint:errcheck + switch subject { + case events.SubjectDeploymentSucceeded: + DeploymentsTotal.WithLabelValues("success").Inc() + case events.SubjectDeploymentFailed: + DeploymentsTotal.WithLabelValues("failure").Inc() + case events.SubjectDeploymentStarted: + DeploymentsTotal.WithLabelValues("pending").Inc() + } + }) + if err != nil { + log.Printf("observability: subscribe deployment.*: %v", err) + } else { + defer unsub2() + } + + log.Printf("observability: NATS metric watcher started") + <-ctx.Done() +}