From 14f9bdaa252ea4f2a7f3068c542e06d01cf5f9f1 Mon Sep 17 00:00:00 2001 From: Oded Ben Ozer Date: Fri, 29 Nov 2024 13:27:17 +0100 Subject: [PATCH 01/16] Initial commit for monitoring PR handling failures. We both explicit failures with pr_handle_failures_total And cases where telefonistka commit status check is left in "pending" state (because Telefonistka exploded while handling that event ) --- cmd/telefonistka/server.go | 8 +++ internal/pkg/githubapi/github.go | 1 + internal/pkg/githubapi/pr_metrics.go | 88 +++++++++++++++++++++++++++ internal/pkg/prometheus/prometheus.go | 41 +++++++++++++ 4 files changed, 138 insertions(+) create mode 100644 internal/pkg/githubapi/pr_metrics.go diff --git a/cmd/telefonistka/server.go b/cmd/telefonistka/server.go index 076b28f..005aa7e 100644 --- a/cmd/telefonistka/server.go +++ b/cmd/telefonistka/server.go @@ -57,6 +57,14 @@ func serve() { mainGhClientCache, _ := lru.New[string, githubapi.GhClientPair](128) prApproverGhClientCache, _ := lru.New[string, githubapi.GhClientPair](128) + go func() { + for { + log.Debugf("Stale check runner") + githubapi.GetPrMetrics(mainGhClientCache) + time.Sleep(60 * time.Second) //TODO: make this configurable? GH API rate limits are a facor here + } + }() + mux := http.NewServeMux() mux.HandleFunc("/webhook", handleWebhook(githubWebhookSecret, mainGhClientCache, prApproverGhClientCache)) mux.Handle("/metrics", promhttp.Handler()) diff --git a/internal/pkg/githubapi/github.go b/internal/pkg/githubapi/github.go index f837299..e6b754a 100644 --- a/internal/pkg/githubapi/github.go +++ b/internal/pkg/githubapi/github.go @@ -146,6 +146,7 @@ func HandlePREvent(eventPayload *github.PullRequestEvent, ghPrClientDetails GhPr defer func() { if err != nil { SetCommitStatus(ghPrClientDetails, "error") + prom.IncPrHandleFailuresCounter(ghPrClientDetails.Owner + "/" + ghPrClientDetails.Repo) return } SetCommitStatus(ghPrClientDetails, "success") diff --git a/internal/pkg/githubapi/pr_metrics.go b/internal/pkg/githubapi/pr_metrics.go new file mode 100644 index 0000000..bc1d6c8 --- /dev/null +++ b/internal/pkg/githubapi/pr_metrics.go @@ -0,0 +1,88 @@ +package githubapi + +import ( + "context" + "time" + + "github.com/google/go-github/v62/github" + lru "github.com/hashicorp/golang-lru/v2" + log "github.com/sirupsen/logrus" + prom "github.com/wayfair-incubator/telefonistka/internal/pkg/prometheus" +) + +const ( + minutesToDfineStale = 20 +) + +func getRepoPrMetrics(ctx context.Context, ghClient GhClientPair, repo *github.Repository) (prWithStakeChecks int, openPRs int, openPromotionPrs int, err error) { + + log.Debugf("Checking repo %s", repo.GetName()) + ghOwner := repo.GetOwner().GetLogin() + prListOpts := &github.PullRequestListOptions{ + State: "open", + } + prs := []*github.PullRequest{} + + // paginate through PRs, there might be lot of them. + for { + perPagePrs, resp, err := ghClient.v3Client.PullRequests.List(ctx, ghOwner, repo.GetName(), prListOpts) + _ = prom.InstrumentGhCall(resp) + if err != nil { + log.Errorf("error getting PRs for %s/%s: %v", ghOwner, repo.GetName(), err) + } + prs = append(prs, perPagePrs...) + if resp.NextPage == 0 { + break + } + prListOpts.Page = resp.NextPage + } + + for _, pr := range prs { + if DoesPrHasLabel(pr.Labels, "promotion") { + openPromotionPrs++ + } + log.Debugf("Checking PR %d", pr.GetNumber()) + commitStatuses, resp, err := ghClient.v3Client.Repositories.GetCombinedStatus(ctx, ghOwner, repo.GetName(), pr.GetHead().GetSHA(), nil) + _ = prom.InstrumentGhCall(resp) + if err != nil { + log.Errorf("error getting statuses for %s/%s/%d: %v", ghOwner, repo.GetName(), pr.GetNumber(), err) + continue + } + for _, status := range commitStatuses.Statuses { + if *status.Context == "telefonistka" && + *status.State == "pending" && + status.UpdatedAt.GetTime().Before(time.Now().Add(-time.Minute*minutesToDfineStale)) { + log.Debugf("Adding status %s-%v-%s !!!", *status.Context, status.UpdatedAt.GetTime(), *status.State) + prWithStakeChecks++ + } else { + log.Debugf("Ignoring status %s-%v-%s", *status.Context, status.UpdatedAt.GetTime(), *status.State) + } + + } + + } + openPRs = len(prs) + + return +} + +// GetPrMetrics counts the number of pending checks that are older than 20 minutes +func GetPrMetrics(mainGhClientCache *lru.Cache[string, GhClientPair]) { + ctx := context.Background() // TODO!!!! + + for _, ghOwner := range mainGhClientCache.Keys() { + log.Debugf("Checking gh Owner %s", ghOwner) + ghClient, _ := mainGhClientCache.Get(ghOwner) + repos, resp, err := ghClient.v3Client.Apps.ListRepos(ctx, nil) + _ = prom.InstrumentGhCall(resp) + if err != nil { + log.Errorf("error getting repos for %s: %v", ghOwner, err) + continue + } + for _, repo := range repos.Repositories { + stalePendingChecks, openPrs, promotionPrs, _ := getRepoPrMetrics(ctx, ghClient, repo) + prom.PublishPrMetrics(openPrs, promotionPrs, stalePendingChecks, repo.GetFullName()) + } + + } +} diff --git a/internal/pkg/prometheus/prometheus.go b/internal/pkg/prometheus/prometheus.go index 6056d49..ebf4252 100644 --- a/internal/pkg/prometheus/prometheus.go +++ b/internal/pkg/prometheus/prometheus.go @@ -39,6 +39,34 @@ var ( Subsystem: "github", }, []string{"api_group", "api_path", "repo_slug", "status", "method"}) + ghOpenPrsGauge = promauto.NewGaugeVec(prometheus.GaugeOpts{ + Name: "open_prs", + Help: "The total number of open PRs", + Namespace: "telefonistka", + Subsystem: "github", + }, []string{"repo_slug"}) + + ghOpenPromotionPrsGauge = promauto.NewGaugeVec(prometheus.GaugeOpts{ + Name: "open_promotion_prs", + Help: "The total number of open PRs with promotion label", + Namespace: "telefonistka", + Subsystem: "github", + }, []string{"repo_slug"}) + + ghOpenPrsWithPendingCheckGauge = promauto.NewGaugeVec(prometheus.GaugeOpts{ + Name: "open_prs_with_pending_telefonistka_checks", + Help: "The total number of open PRs with pending Telefonistka checks(excluding PRs with very recent commits)", + Namespace: "telefonistka", + Subsystem: "github", + }, []string{"repo_slug"}) + + prHandleFailuresCounter = promauto.NewCounterVec(prometheus.CounterOpts{ + Name: "pr_handle_failures_total", + Help: "The total number of PR handling failures", + Namespace: "telefonistka", + Subsystem: "core", + }, []string{"repo_slug"}) + whUpstreamRequestsCountVec = promauto.NewCounterVec(prometheus.CounterOpts{ Name: "upstream_requests_total", Help: "The total number of requests forwarded upstream servers", @@ -47,6 +75,19 @@ var ( }, []string{"status", "method", "url"}) ) +func IncPrHandleFailuresCounter(repoSlug string) { + prHandleFailuresCounter.With(prometheus.Labels{"repo_slug": repoSlug}).Inc() +} + +func PublishPrMetrics(openPrs int, openPromPRs int, openPrsWithPendingChecks int, repoSlug string) { + metricLables := prometheus.Labels{ + "repo_slug": repoSlug, + } + ghOpenPrsGauge.With(metricLables).Set(float64(openPrs)) + ghOpenPromotionPrsGauge.With(metricLables).Set(float64(openPromPRs)) + ghOpenPrsWithPendingCheckGauge.With(metricLables).Set(float64(openPrsWithPendingChecks)) +} + // This function instrument Webhook hits and parsing of their content func InstrumentWebhookHit(parsing_status string) { webhookHitsVec.With(prometheus.Labels{"parsing": parsing_status}).Inc() From 336f921259346d928b12f800de05fbe98907a346 Mon Sep 17 00:00:00 2001 From: Oded Ben Ozer Date: Fri, 29 Nov 2024 13:56:01 +0100 Subject: [PATCH 02/16] lint fixes --- cmd/telefonistka/server.go | 2 +- internal/pkg/githubapi/pr_metrics.go | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/cmd/telefonistka/server.go b/cmd/telefonistka/server.go index 005aa7e..eb69a40 100644 --- a/cmd/telefonistka/server.go +++ b/cmd/telefonistka/server.go @@ -61,7 +61,7 @@ func serve() { for { log.Debugf("Stale check runner") githubapi.GetPrMetrics(mainGhClientCache) - time.Sleep(60 * time.Second) //TODO: make this configurable? GH API rate limits are a facor here + time.Sleep(60 * time.Second) // TODO: make this configurable? GH API rate limits are a facor here } }() diff --git a/internal/pkg/githubapi/pr_metrics.go b/internal/pkg/githubapi/pr_metrics.go index bc1d6c8..9fa99f0 100644 --- a/internal/pkg/githubapi/pr_metrics.go +++ b/internal/pkg/githubapi/pr_metrics.go @@ -15,7 +15,6 @@ const ( ) func getRepoPrMetrics(ctx context.Context, ghClient GhClientPair, repo *github.Repository) (prWithStakeChecks int, openPRs int, openPromotionPrs int, err error) { - log.Debugf("Checking repo %s", repo.GetName()) ghOwner := repo.GetOwner().GetLogin() prListOpts := &github.PullRequestListOptions{ @@ -57,9 +56,7 @@ func getRepoPrMetrics(ctx context.Context, ghClient GhClientPair, repo *github.R } else { log.Debugf("Ignoring status %s-%v-%s", *status.Context, status.UpdatedAt.GetTime(), *status.State) } - } - } openPRs = len(prs) @@ -83,6 +80,5 @@ func GetPrMetrics(mainGhClientCache *lru.Cache[string, GhClientPair]) { stalePendingChecks, openPrs, promotionPrs, _ := getRepoPrMetrics(ctx, ghClient, repo) prom.PublishPrMetrics(openPrs, promotionPrs, stalePendingChecks, repo.GetFullName()) } - } } From 5d3f24863e0481c1e032e387718d54602f675dd5 Mon Sep 17 00:00:00 2001 From: Oded Ben Ozer Date: Fri, 29 Nov 2024 16:22:09 +0100 Subject: [PATCH 03/16] Add test --- internal/pkg/githubapi/pr_metrics.go | 37 +++++++--- internal/pkg/githubapi/pr_metrics_test.go | 90 +++++++++++++++++++++++ 2 files changed, 115 insertions(+), 12 deletions(-) create mode 100644 internal/pkg/githubapi/pr_metrics_test.go diff --git a/internal/pkg/githubapi/pr_metrics.go b/internal/pkg/githubapi/pr_metrics.go index 9fa99f0..b932433 100644 --- a/internal/pkg/githubapi/pr_metrics.go +++ b/internal/pkg/githubapi/pr_metrics.go @@ -22,7 +22,7 @@ func getRepoPrMetrics(ctx context.Context, ghClient GhClientPair, repo *github.R } prs := []*github.PullRequest{} - // paginate through PRs, there might be lot of them. + // paginate through PRs, there might be lots of them. for { perPagePrs, resp, err := ghClient.v3Client.PullRequests.List(ctx, ghOwner, repo.GetName(), prListOpts) _ = prom.InstrumentGhCall(resp) @@ -40,6 +40,7 @@ func getRepoPrMetrics(ctx context.Context, ghClient GhClientPair, repo *github.R if DoesPrHasLabel(pr.Labels, "promotion") { openPromotionPrs++ } + log.Debugf("Checking PR %d", pr.GetNumber()) commitStatuses, resp, err := ghClient.v3Client.Repositories.GetCombinedStatus(ctx, ghOwner, repo.GetName(), pr.GetHead().GetSHA(), nil) _ = prom.InstrumentGhCall(resp) @@ -47,15 +48,8 @@ func getRepoPrMetrics(ctx context.Context, ghClient GhClientPair, repo *github.R log.Errorf("error getting statuses for %s/%s/%d: %v", ghOwner, repo.GetName(), pr.GetNumber(), err) continue } - for _, status := range commitStatuses.Statuses { - if *status.Context == "telefonistka" && - *status.State == "pending" && - status.UpdatedAt.GetTime().Before(time.Now().Add(-time.Minute*minutesToDfineStale)) { - log.Debugf("Adding status %s-%v-%s !!!", *status.Context, status.UpdatedAt.GetTime(), *status.State) - prWithStakeChecks++ - } else { - log.Debugf("Ignoring status %s-%v-%s", *status.Context, status.UpdatedAt.GetTime(), *status.State) - } + if isPrStalePending(commitStatuses, minutesToDfineStale) { + prWithStakeChecks++ } } openPRs = len(prs) @@ -63,14 +57,33 @@ func getRepoPrMetrics(ctx context.Context, ghClient GhClientPair, repo *github.R return } -// GetPrMetrics counts the number of pending checks that are older than 20 minutes +// isPrStalePending checks if the a combinedStatus has a "telefonistka" context pending status that is older than minutesToDfineStale and is in pending state +func isPrStalePending(commitStatuses *github.CombinedStatus, minutesToDfineStale int) bool { + + staleDuration := time.Duration(minutesToDfineStale) * time.Minute * -1 + + for _, status := range commitStatuses.Statuses { + if *status.Context == "telefonistka" && + *status.State == "pending" && + status.UpdatedAt.GetTime().Before(time.Now().Add(staleDuration)) { + log.Debugf("Adding status %s-%v-%s !!!", *status.Context, status.UpdatedAt.GetTime(), *status.State) + return true + } else { + log.Debugf("Ignoring status %s-%v-%s", *status.Context, status.UpdatedAt.GetTime(), *status.State) + } + } + + return false +} + +// GetPrMetrics itterates through all clients , gets all repos and then all PRs and calculates metrics func GetPrMetrics(mainGhClientCache *lru.Cache[string, GhClientPair]) { ctx := context.Background() // TODO!!!! for _, ghOwner := range mainGhClientCache.Keys() { log.Debugf("Checking gh Owner %s", ghOwner) ghClient, _ := mainGhClientCache.Get(ghOwner) - repos, resp, err := ghClient.v3Client.Apps.ListRepos(ctx, nil) + repos, resp, err := ghClient.v3Client.Apps.ListRepos(ctx, nil) // TODO what if you are not an app? _ = prom.InstrumentGhCall(resp) if err != nil { log.Errorf("error getting repos for %s: %v", ghOwner, err) diff --git a/internal/pkg/githubapi/pr_metrics_test.go b/internal/pkg/githubapi/pr_metrics_test.go new file mode 100644 index 0000000..db1bf79 --- /dev/null +++ b/internal/pkg/githubapi/pr_metrics_test.go @@ -0,0 +1,90 @@ +package githubapi + +// @Title +// @Description +// @Author +// @Update +import ( + "testing" + "time" + + "github.com/google/go-github/v62/github" +) + +func TestIsPrStalePending(t *testing.T) { + minutesToDfineStale := 15 + + currentTime := time.Now() + tests := map[string]struct { + input github.CombinedStatus + result bool + }{ + "All Success": { + input: github.CombinedStatus{ + Statuses: []*github.RepoStatus{ + { + State: github.String("success"), + Context: github.String("telefonistka"), + UpdatedAt: &github.Timestamp{ + Time: currentTime.Add(-10 * time.Minute), + }, + }, + { + State: github.String("success"), + Context: github.String("circleci"), + UpdatedAt: &github.Timestamp{ + Time: currentTime.Add(-10 * time.Minute), + }, + }, + { + State: github.String("success"), + Context: github.String("foobar"), + UpdatedAt: &github.Timestamp{ + Time: currentTime.Add(-10 * time.Minute), + }, + }, + }, + }, + result: false, + }, + "Pending but not stale": { + input: github.CombinedStatus{ + Statuses: []*github.RepoStatus{ + { + State: github.String("pending"), + Context: github.String("telefonistka"), + UpdatedAt: &github.Timestamp{ + Time: currentTime.Add(-1 * time.Minute), + }, + }, + }, + }, + result: false, + }, + + "Pending and stale": { + input: github.CombinedStatus{ + Statuses: []*github.RepoStatus{ + { + State: github.String("pending"), + Context: github.String("telefonistka"), + UpdatedAt: &github.Timestamp{ + Time: currentTime.Add(-20 * time.Minute), + }, + }, + }, + }, + result: true, + }, + } + + for name, tc := range tests { + t.Run(name, func(t *testing.T) { + result := isPrStalePending(&tc.input, minutesToDfineStale) + if result != tc.result { + t.Errorf("(%s)Expected %v, got %v", name, tc.result, result) + } + }) + } + +} From e6aa7f46372f36695cce8d04d01517c52c03afbc Mon Sep 17 00:00:00 2001 From: Oded Ben Ozer Date: Fri, 29 Nov 2024 16:32:23 +0100 Subject: [PATCH 04/16] Fix typo --- internal/pkg/githubapi/pr_metrics.go | 10 +++++----- internal/pkg/githubapi/pr_metrics_test.go | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/internal/pkg/githubapi/pr_metrics.go b/internal/pkg/githubapi/pr_metrics.go index b932433..22c4541 100644 --- a/internal/pkg/githubapi/pr_metrics.go +++ b/internal/pkg/githubapi/pr_metrics.go @@ -11,7 +11,7 @@ import ( ) const ( - minutesToDfineStale = 20 + minutesToDefineStale = 20 ) func getRepoPrMetrics(ctx context.Context, ghClient GhClientPair, repo *github.Repository) (prWithStakeChecks int, openPRs int, openPromotionPrs int, err error) { @@ -48,7 +48,7 @@ func getRepoPrMetrics(ctx context.Context, ghClient GhClientPair, repo *github.R log.Errorf("error getting statuses for %s/%s/%d: %v", ghOwner, repo.GetName(), pr.GetNumber(), err) continue } - if isPrStalePending(commitStatuses, minutesToDfineStale) { + if isPrStalePending(commitStatuses, minutesToDefineStale) { prWithStakeChecks++ } } @@ -57,10 +57,10 @@ func getRepoPrMetrics(ctx context.Context, ghClient GhClientPair, repo *github.R return } -// isPrStalePending checks if the a combinedStatus has a "telefonistka" context pending status that is older than minutesToDfineStale and is in pending state -func isPrStalePending(commitStatuses *github.CombinedStatus, minutesToDfineStale int) bool { +// isPrStalePending checks if the a combinedStatus has a "telefonistka" context pending status that is older than minutesToDefineStale and is in pending state +func isPrStalePending(commitStatuses *github.CombinedStatus, minutesToDefineStale int) bool { - staleDuration := time.Duration(minutesToDfineStale) * time.Minute * -1 + staleDuration := time.Duration(minutesToDefineStale) * time.Minute * -1 for _, status := range commitStatuses.Statuses { if *status.Context == "telefonistka" && diff --git a/internal/pkg/githubapi/pr_metrics_test.go b/internal/pkg/githubapi/pr_metrics_test.go index db1bf79..36f13ea 100644 --- a/internal/pkg/githubapi/pr_metrics_test.go +++ b/internal/pkg/githubapi/pr_metrics_test.go @@ -12,7 +12,7 @@ import ( ) func TestIsPrStalePending(t *testing.T) { - minutesToDfineStale := 15 + minutesToDefineStale := 15 currentTime := time.Now() tests := map[string]struct { @@ -80,7 +80,7 @@ func TestIsPrStalePending(t *testing.T) { for name, tc := range tests { t.Run(name, func(t *testing.T) { - result := isPrStalePending(&tc.input, minutesToDfineStale) + result := isPrStalePending(&tc.input, minutesToDefineStale) if result != tc.result { t.Errorf("(%s)Expected %v, got %v", name, tc.result, result) } From 755b603f4c3f72622e5f2109b55df0ec122d16c2 Mon Sep 17 00:00:00 2001 From: Oded Ben Ozer Date: Fri, 29 Nov 2024 16:53:01 +0100 Subject: [PATCH 05/16] Use a more generic approach to monitoring PR handle failures. This way we can also monitor the successes, so we can get a ratio or just the activity rate of the repo and such --- internal/pkg/githubapi/github.go | 3 ++- internal/pkg/githubapi/pr_metrics.go | 1 - internal/pkg/prometheus/prometheus.go | 17 ++++++++++------- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/internal/pkg/githubapi/github.go b/internal/pkg/githubapi/github.go index e6b754a..031bb47 100644 --- a/internal/pkg/githubapi/github.go +++ b/internal/pkg/githubapi/github.go @@ -146,7 +146,6 @@ func HandlePREvent(eventPayload *github.PullRequestEvent, ghPrClientDetails GhPr defer func() { if err != nil { SetCommitStatus(ghPrClientDetails, "error") - prom.IncPrHandleFailuresCounter(ghPrClientDetails.Owner + "/" + ghPrClientDetails.Repo) return } SetCommitStatus(ghPrClientDetails, "success") @@ -877,6 +876,8 @@ func SetCommitStatus(ghPrClientDetails GhPrClientDetails, state string) { _, resp, err := ghPrClientDetails.GhClientPair.v3Client.Repositories.CreateStatus(ctx, ghPrClientDetails.Owner, ghPrClientDetails.Repo, ghPrClientDetails.PrSHA, commitStatus) prom.InstrumentGhCall(resp) + repoSlug := ghPrClientDetails.Owner + "/" + ghPrClientDetails.Repo + prom.IncCommitStatusUpdateCounter(repoSlug, state) if err != nil { ghPrClientDetails.PrLogger.Errorf("Failed to set commit status: err=%s\n%v", err, resp) } diff --git a/internal/pkg/githubapi/pr_metrics.go b/internal/pkg/githubapi/pr_metrics.go index 22c4541..002a8b1 100644 --- a/internal/pkg/githubapi/pr_metrics.go +++ b/internal/pkg/githubapi/pr_metrics.go @@ -59,7 +59,6 @@ func getRepoPrMetrics(ctx context.Context, ghClient GhClientPair, repo *github.R // isPrStalePending checks if the a combinedStatus has a "telefonistka" context pending status that is older than minutesToDefineStale and is in pending state func isPrStalePending(commitStatuses *github.CombinedStatus, minutesToDefineStale int) bool { - staleDuration := time.Duration(minutesToDefineStale) * time.Minute * -1 for _, status := range commitStatuses.Statuses { diff --git a/internal/pkg/prometheus/prometheus.go b/internal/pkg/prometheus/prometheus.go index ebf4252..549cd8f 100644 --- a/internal/pkg/prometheus/prometheus.go +++ b/internal/pkg/prometheus/prometheus.go @@ -60,12 +60,12 @@ var ( Subsystem: "github", }, []string{"repo_slug"}) - prHandleFailuresCounter = promauto.NewCounterVec(prometheus.CounterOpts{ - Name: "pr_handle_failures_total", - Help: "The total number of PR handling failures", + commitStatusUpdates = promauto.NewCounterVec(prometheus.CounterOpts{ + Name: "commit_status_updates_total", + Help: "The total number of commit status updates, and their status (success/pending/failure)", Namespace: "telefonistka", - Subsystem: "core", - }, []string{"repo_slug"}) + Subsystem: "github", + }, []string{"repo_slug", "status"}) whUpstreamRequestsCountVec = promauto.NewCounterVec(prometheus.CounterOpts{ Name: "upstream_requests_total", @@ -75,8 +75,11 @@ var ( }, []string{"status", "method", "url"}) ) -func IncPrHandleFailuresCounter(repoSlug string) { - prHandleFailuresCounter.With(prometheus.Labels{"repo_slug": repoSlug}).Inc() +func IncCommitStatusUpdateCounter(repoSlug string, status string) { + commitStatusUpdates.With(prometheus.Labels{ + "repo_slug": repoSlug, + "status": status, + }).Inc() } func PublishPrMetrics(openPrs int, openPromPRs int, openPrsWithPendingChecks int, repoSlug string) { From 2c29af9a346d255680765d26d9f7d85a7409e454 Mon Sep 17 00:00:00 2001 From: Oded Ben Ozer Date: Fri, 29 Nov 2024 17:05:07 +0100 Subject: [PATCH 06/16] Document new metrics --- docs/observability.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/observability.md b/docs/observability.md index 03ed81d..c0efcf0 100644 --- a/docs/observability.md +++ b/docs/observability.md @@ -6,6 +6,10 @@ |telefonistka_github_github_rest_api_client_rate_remaining|gauge|The number of remaining requests the client can make this hour|| |telefonistka_github_github_rest_api_client_rate_limit|gauge|The number of requests per hour the client is currently limited to|| |telefonistka_webhook_server_webhook_hits_total|counter|The total number of validated webhook hits|`parsing`| +|telefonistka_github_open_prs|gauge|The number of open PRs|`repo_slug`| +|telefonistka_github_open_promotion_prs|gauge|The number of open promotion PRs|`repo_slug`| +|telefonistka_github_open_prs_with_pending_telefonistka_checks|gauge|The number of open PRs with pending Telefonistka checks(excluding PRs with very recent commits)|`repo_slug`| +|telefonistka_github_commit_status_updates_total|counter|The total number of commit status updates, and their status (success/pending/failure)|`repo_slug`, `status`| Example metrics snippet: From 84399abc5a4471e6560666686b6025f5582c2583 Mon Sep 17 00:00:00 2001 From: Oded Ben Ozer Date: Wed, 4 Dec 2024 11:57:19 +0100 Subject: [PATCH 07/16] Move metrics loop to metrics file. Express const timeframes in time (not int) --- cmd/telefonistka/server.go | 8 +------- internal/pkg/githubapi/pr_metrics.go | 24 +++++++++++++++--------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/cmd/telefonistka/server.go b/cmd/telefonistka/server.go index eb69a40..0477a42 100644 --- a/cmd/telefonistka/server.go +++ b/cmd/telefonistka/server.go @@ -57,13 +57,7 @@ func serve() { mainGhClientCache, _ := lru.New[string, githubapi.GhClientPair](128) prApproverGhClientCache, _ := lru.New[string, githubapi.GhClientPair](128) - go func() { - for { - log.Debugf("Stale check runner") - githubapi.GetPrMetrics(mainGhClientCache) - time.Sleep(60 * time.Second) // TODO: make this configurable? GH API rate limits are a facor here - } - }() + go githubapi.MainGhMetricsLoop(mainGhClientCache) mux := http.NewServeMux() mux.HandleFunc("/webhook", handleWebhook(githubWebhookSecret, mainGhClientCache, prApproverGhClientCache)) diff --git a/internal/pkg/githubapi/pr_metrics.go b/internal/pkg/githubapi/pr_metrics.go index 002a8b1..854c543 100644 --- a/internal/pkg/githubapi/pr_metrics.go +++ b/internal/pkg/githubapi/pr_metrics.go @@ -11,9 +11,17 @@ import ( ) const ( - minutesToDefineStale = 20 + timeToDefineStale = 20 * time.Minute + metricRefreshTime = 60 * time.Second // TODO: make this configurable? GH API rate limits are a factor here ) +func MainGhMetricsLoop(mainGhClientCache *lru.Cache[string, GhClientPair]) { + for { + getPrMetrics(mainGhClientCache) + time.Sleep(metricRefreshTime) + } +} + func getRepoPrMetrics(ctx context.Context, ghClient GhClientPair, repo *github.Repository) (prWithStakeChecks int, openPRs int, openPromotionPrs int, err error) { log.Debugf("Checking repo %s", repo.GetName()) ghOwner := repo.GetOwner().GetLogin() @@ -48,7 +56,7 @@ func getRepoPrMetrics(ctx context.Context, ghClient GhClientPair, repo *github.R log.Errorf("error getting statuses for %s/%s/%d: %v", ghOwner, repo.GetName(), pr.GetNumber(), err) continue } - if isPrStalePending(commitStatuses, minutesToDefineStale) { + if isPrStalePending(commitStatuses, timeToDefineStale) { prWithStakeChecks++ } } @@ -57,14 +65,12 @@ func getRepoPrMetrics(ctx context.Context, ghClient GhClientPair, repo *github.R return } -// isPrStalePending checks if the a combinedStatus has a "telefonistka" context pending status that is older than minutesToDefineStale and is in pending state -func isPrStalePending(commitStatuses *github.CombinedStatus, minutesToDefineStale int) bool { - staleDuration := time.Duration(minutesToDefineStale) * time.Minute * -1 - +// isPrStalePending checks if the a combinedStatus has a "telefonistka" context pending status that is older than timeToDefineStale and is in pending state +func isPrStalePending(commitStatuses *github.CombinedStatus, timeToDefineStale time.Duration) bool { for _, status := range commitStatuses.Statuses { if *status.Context == "telefonistka" && *status.State == "pending" && - status.UpdatedAt.GetTime().Before(time.Now().Add(staleDuration)) { + status.UpdatedAt.GetTime().Before(time.Now().Add(timeToDefineStale*-1)) { log.Debugf("Adding status %s-%v-%s !!!", *status.Context, status.UpdatedAt.GetTime(), *status.State) return true } else { @@ -75,8 +81,8 @@ func isPrStalePending(commitStatuses *github.CombinedStatus, minutesToDefineStal return false } -// GetPrMetrics itterates through all clients , gets all repos and then all PRs and calculates metrics -func GetPrMetrics(mainGhClientCache *lru.Cache[string, GhClientPair]) { +// getPrMetrics itterates through all clients , gets all repos and then all PRs and calculates metrics +func getPrMetrics(mainGhClientCache *lru.Cache[string, GhClientPair]) { ctx := context.Background() // TODO!!!! for _, ghOwner := range mainGhClientCache.Keys() { From 9017b8758d78f72771d445bc88ee4358de42cfa0 Mon Sep 17 00:00:00 2001 From: Oded Ben Ozer Date: Wed, 4 Dec 2024 12:04:49 +0100 Subject: [PATCH 08/16] Fix type issue --- internal/pkg/githubapi/pr_metrics_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/pkg/githubapi/pr_metrics_test.go b/internal/pkg/githubapi/pr_metrics_test.go index 36f13ea..83a63ba 100644 --- a/internal/pkg/githubapi/pr_metrics_test.go +++ b/internal/pkg/githubapi/pr_metrics_test.go @@ -12,7 +12,7 @@ import ( ) func TestIsPrStalePending(t *testing.T) { - minutesToDefineStale := 15 + timeToDefineStale := 15 * time.Minute currentTime := time.Now() tests := map[string]struct { @@ -80,7 +80,7 @@ func TestIsPrStalePending(t *testing.T) { for name, tc := range tests { t.Run(name, func(t *testing.T) { - result := isPrStalePending(&tc.input, minutesToDefineStale) + result := isPrStalePending(&tc.input, timeToDefineStale) if result != tc.result { t.Errorf("(%s)Expected %v, got %v", name, tc.result, result) } From 3ed74c642e5dec08b1b35eae059066fad19d313f Mon Sep 17 00:00:00 2001 From: Oded Ben Ozer Date: Wed, 4 Dec 2024 13:23:47 +0100 Subject: [PATCH 09/16] Fix lint issue --- internal/pkg/githubapi/pr_metrics_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/pkg/githubapi/pr_metrics_test.go b/internal/pkg/githubapi/pr_metrics_test.go index 83a63ba..30b80fe 100644 --- a/internal/pkg/githubapi/pr_metrics_test.go +++ b/internal/pkg/githubapi/pr_metrics_test.go @@ -12,6 +12,7 @@ import ( ) func TestIsPrStalePending(t *testing.T) { + t.Parallel() timeToDefineStale := 15 * time.Minute currentTime := time.Now() @@ -86,5 +87,4 @@ func TestIsPrStalePending(t *testing.T) { } }) } - } From 0682da7577d6597adf82d4806c2578c36a090f50 Mon Sep 17 00:00:00 2001 From: Oded Ben Ozer Date: Wed, 4 Dec 2024 13:33:14 +0100 Subject: [PATCH 10/16] Lint stuff --- internal/pkg/githubapi/pr_metrics_test.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/internal/pkg/githubapi/pr_metrics_test.go b/internal/pkg/githubapi/pr_metrics_test.go index 30b80fe..5018c45 100644 --- a/internal/pkg/githubapi/pr_metrics_test.go +++ b/internal/pkg/githubapi/pr_metrics_test.go @@ -80,7 +80,10 @@ func TestIsPrStalePending(t *testing.T) { } for name, tc := range tests { + name := name + tc := tc t.Run(name, func(t *testing.T) { + t.Parallel() result := isPrStalePending(&tc.input, timeToDefineStale) if result != tc.result { t.Errorf("(%s)Expected %v, got %v", name, tc.result, result) From 6f5801c9af8d8fac54cac608ef8ef14913b061ea Mon Sep 17 00:00:00 2001 From: Oded Ben Ozer Date: Wed, 4 Dec 2024 13:57:18 +0100 Subject: [PATCH 11/16] Document dep' on GH apps improve error handeling Add context timeout --- docs/observability.md | 3 +++ internal/pkg/githubapi/pr_metrics.go | 16 +++++++++++----- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/docs/observability.md b/docs/observability.md index c0efcf0..11f54ed 100644 --- a/docs/observability.md +++ b/docs/observability.md @@ -11,6 +11,9 @@ |telefonistka_github_open_prs_with_pending_telefonistka_checks|gauge|The number of open PRs with pending Telefonistka checks(excluding PRs with very recent commits)|`repo_slug`| |telefonistka_github_commit_status_updates_total|counter|The total number of commit status updates, and their status (success/pending/failure)|`repo_slug`, `status`| +> [!NOTE] +> telefonistka_github_*_prs metrics are only supported on installtions that uses GitHub App authentication as it provides an easy way to query the relevant GH repos. + Example metrics snippet: ```text diff --git a/internal/pkg/githubapi/pr_metrics.go b/internal/pkg/githubapi/pr_metrics.go index 854c543..d5a3e65 100644 --- a/internal/pkg/githubapi/pr_metrics.go +++ b/internal/pkg/githubapi/pr_metrics.go @@ -12,7 +12,7 @@ import ( const ( timeToDefineStale = 20 * time.Minute - metricRefreshTime = 60 * time.Second // TODO: make this configurable? GH API rate limits are a factor here + metricRefreshTime = 60 * time.Second ) func MainGhMetricsLoop(mainGhClientCache *lru.Cache[string, GhClientPair]) { @@ -82,20 +82,26 @@ func isPrStalePending(commitStatuses *github.CombinedStatus, timeToDefineStale t } // getPrMetrics itterates through all clients , gets all repos and then all PRs and calculates metrics +// This assumes Telefonsitka uses a GitHub App style of authentication as it uses the Apps.ListRepos call +// When using personal access token authentication, Telefonistka is unaware of the "relevant" repos (at least it get a webhook from them). func getPrMetrics(mainGhClientCache *lru.Cache[string, GhClientPair]) { - ctx := context.Background() // TODO!!!! - + ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute) + defer cancel() for _, ghOwner := range mainGhClientCache.Keys() { log.Debugf("Checking gh Owner %s", ghOwner) ghClient, _ := mainGhClientCache.Get(ghOwner) - repos, resp, err := ghClient.v3Client.Apps.ListRepos(ctx, nil) // TODO what if you are not an app? + repos, resp, err := ghClient.v3Client.Apps.ListRepos(ctx, nil) _ = prom.InstrumentGhCall(resp) if err != nil { log.Errorf("error getting repos for %s: %v", ghOwner, err) continue } for _, repo := range repos.Repositories { - stalePendingChecks, openPrs, promotionPrs, _ := getRepoPrMetrics(ctx, ghClient, repo) + stalePendingChecks, openPrs, promotionPrs, err := getRepoPrMetrics(ctx, ghClient, repo) + if err != nil { + log.Errorf("error getting repos for %s: %v", ghOwner, err) + continue + } prom.PublishPrMetrics(openPrs, promotionPrs, stalePendingChecks, repo.GetFullName()) } } From 662d47c7accaa186c093db8a25ebb1b48d89d50b Mon Sep 17 00:00:00 2001 From: Oded Ben Ozer Date: Mon, 9 Dec 2024 10:57:53 +0100 Subject: [PATCH 12/16] Switch sleep for Ticker --- internal/pkg/githubapi/pr_metrics.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/pkg/githubapi/pr_metrics.go b/internal/pkg/githubapi/pr_metrics.go index d5a3e65..e91fa3b 100644 --- a/internal/pkg/githubapi/pr_metrics.go +++ b/internal/pkg/githubapi/pr_metrics.go @@ -16,9 +16,9 @@ const ( ) func MainGhMetricsLoop(mainGhClientCache *lru.Cache[string, GhClientPair]) { - for { + for t := range time.Tick(metricRefreshTime) { + log.Debugf("Updating pr metrics at %v", t) getPrMetrics(mainGhClientCache) - time.Sleep(metricRefreshTime) } } From be4d9748c2f82a53b9b49d775d70cf8a83a03ec4 Mon Sep 17 00:00:00 2001 From: Oded Ben Ozer Date: Tue, 10 Dec 2024 09:21:54 +0100 Subject: [PATCH 13/16] typo fix --- internal/pkg/githubapi/pr_metrics.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/pkg/githubapi/pr_metrics.go b/internal/pkg/githubapi/pr_metrics.go index e91fa3b..ac8d023 100644 --- a/internal/pkg/githubapi/pr_metrics.go +++ b/internal/pkg/githubapi/pr_metrics.go @@ -22,7 +22,7 @@ func MainGhMetricsLoop(mainGhClientCache *lru.Cache[string, GhClientPair]) { } } -func getRepoPrMetrics(ctx context.Context, ghClient GhClientPair, repo *github.Repository) (prWithStakeChecks int, openPRs int, openPromotionPrs int, err error) { +func getRepoPrMetrics(ctx context.Context, ghClient GhClientPair, repo *github.Repository) (prWithStaleChecks int, openPRs int, openPromotionPrs int, err error) { log.Debugf("Checking repo %s", repo.GetName()) ghOwner := repo.GetOwner().GetLogin() prListOpts := &github.PullRequestListOptions{ @@ -57,7 +57,7 @@ func getRepoPrMetrics(ctx context.Context, ghClient GhClientPair, repo *github.R continue } if isPrStalePending(commitStatuses, timeToDefineStale) { - prWithStakeChecks++ + prWithStaleChecks++ } } openPRs = len(prs) From 78f43e7fdffeacfb89731d0ddc6caabe0d2abef7 Mon Sep 17 00:00:00 2001 From: Oded Ben Ozer Date: Thu, 12 Dec 2024 09:06:16 +0100 Subject: [PATCH 14/16] Apply suggestions from code review Co-authored-by: Yazdan Mohammadi --- internal/pkg/githubapi/pr_metrics.go | 4 ++-- internal/pkg/githubapi/pr_metrics_test.go | 4 ---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/internal/pkg/githubapi/pr_metrics.go b/internal/pkg/githubapi/pr_metrics.go index ac8d023..0e734a4 100644 --- a/internal/pkg/githubapi/pr_metrics.go +++ b/internal/pkg/githubapi/pr_metrics.go @@ -81,8 +81,8 @@ func isPrStalePending(commitStatuses *github.CombinedStatus, timeToDefineStale t return false } -// getPrMetrics itterates through all clients , gets all repos and then all PRs and calculates metrics -// This assumes Telefonsitka uses a GitHub App style of authentication as it uses the Apps.ListRepos call +// getPrMetrics iterates through all clients , gets all repos and then all PRs and calculates metrics +// getPrMetrics assumes Telefonsitka uses a GitHub App style of authentication as it uses the Apps.ListRepos call // When using personal access token authentication, Telefonistka is unaware of the "relevant" repos (at least it get a webhook from them). func getPrMetrics(mainGhClientCache *lru.Cache[string, GhClientPair]) { ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute) diff --git a/internal/pkg/githubapi/pr_metrics_test.go b/internal/pkg/githubapi/pr_metrics_test.go index 5018c45..f1a90d8 100644 --- a/internal/pkg/githubapi/pr_metrics_test.go +++ b/internal/pkg/githubapi/pr_metrics_test.go @@ -1,9 +1,5 @@ package githubapi -// @Title -// @Description -// @Author -// @Update import ( "testing" "time" From 4cc9584c9952591bb163dc2a413570e32dc3ca84 Mon Sep 17 00:00:00 2001 From: Oded Ben Ozer Date: Thu, 12 Dec 2024 09:49:17 +0100 Subject: [PATCH 15/16] Use a struct to pass data between getRepoPrMetrics and PublishPrMetrics --- internal/pkg/githubapi/pr_metrics.go | 12 ++++++------ internal/pkg/prometheus/prometheus.go | 14 ++++++++++---- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/internal/pkg/githubapi/pr_metrics.go b/internal/pkg/githubapi/pr_metrics.go index 0e734a4..4b91952 100644 --- a/internal/pkg/githubapi/pr_metrics.go +++ b/internal/pkg/githubapi/pr_metrics.go @@ -22,7 +22,7 @@ func MainGhMetricsLoop(mainGhClientCache *lru.Cache[string, GhClientPair]) { } } -func getRepoPrMetrics(ctx context.Context, ghClient GhClientPair, repo *github.Repository) (prWithStaleChecks int, openPRs int, openPromotionPrs int, err error) { +func getRepoPrMetrics(ctx context.Context, ghClient GhClientPair, repo *github.Repository) (pc prom.PrCounters, err error) { log.Debugf("Checking repo %s", repo.GetName()) ghOwner := repo.GetOwner().GetLogin() prListOpts := &github.PullRequestListOptions{ @@ -46,7 +46,7 @@ func getRepoPrMetrics(ctx context.Context, ghClient GhClientPair, repo *github.R for _, pr := range prs { if DoesPrHasLabel(pr.Labels, "promotion") { - openPromotionPrs++ + pc.OpenPromotionPrs++ } log.Debugf("Checking PR %d", pr.GetNumber()) @@ -57,10 +57,10 @@ func getRepoPrMetrics(ctx context.Context, ghClient GhClientPair, repo *github.R continue } if isPrStalePending(commitStatuses, timeToDefineStale) { - prWithStaleChecks++ + pc.PrWithStaleChecks++ } } - openPRs = len(prs) + pc.OpenPrs = len(prs) return } @@ -97,12 +97,12 @@ func getPrMetrics(mainGhClientCache *lru.Cache[string, GhClientPair]) { continue } for _, repo := range repos.Repositories { - stalePendingChecks, openPrs, promotionPrs, err := getRepoPrMetrics(ctx, ghClient, repo) + pc, err := getRepoPrMetrics(ctx, ghClient, repo) if err != nil { log.Errorf("error getting repos for %s: %v", ghOwner, err) continue } - prom.PublishPrMetrics(openPrs, promotionPrs, stalePendingChecks, repo.GetFullName()) + prom.PublishPrMetrics(pc, repo.GetFullName()) } } } diff --git a/internal/pkg/prometheus/prometheus.go b/internal/pkg/prometheus/prometheus.go index 549cd8f..be3e4ab 100644 --- a/internal/pkg/prometheus/prometheus.go +++ b/internal/pkg/prometheus/prometheus.go @@ -10,6 +10,12 @@ import ( "github.com/prometheus/client_golang/prometheus/promauto" ) +type PrCounters struct { + OpenPrs int + OpenPromotionPrs int + PrWithStaleChecks int +} + var ( webhookHitsVec = promauto.NewCounterVec(prometheus.CounterOpts{ Name: "webhook_hits_total", @@ -82,13 +88,13 @@ func IncCommitStatusUpdateCounter(repoSlug string, status string) { }).Inc() } -func PublishPrMetrics(openPrs int, openPromPRs int, openPrsWithPendingChecks int, repoSlug string) { +func PublishPrMetrics(pc PrCounters, repoSlug string) { metricLables := prometheus.Labels{ "repo_slug": repoSlug, } - ghOpenPrsGauge.With(metricLables).Set(float64(openPrs)) - ghOpenPromotionPrsGauge.With(metricLables).Set(float64(openPromPRs)) - ghOpenPrsWithPendingCheckGauge.With(metricLables).Set(float64(openPrsWithPendingChecks)) + ghOpenPrsGauge.With(metricLables).Set(float64(pc.OpenPrs)) + ghOpenPromotionPrsGauge.With(metricLables).Set(float64(pc.OpenPromotionPrs)) + ghOpenPrsWithPendingCheckGauge.With(metricLables).Set(float64(pc.PrWithStaleChecks)) } // This function instrument Webhook hits and parsing of their content From e51d579624821dc420e2dd727d967bcd933c13a7 Mon Sep 17 00:00:00 2001 From: Oded Ben Ozer Date: Thu, 12 Dec 2024 11:23:58 +0100 Subject: [PATCH 16/16] Add new metrics to example --- docs/observability.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/docs/observability.md b/docs/observability.md index 11f54ed..9304b6a 100644 --- a/docs/observability.md +++ b/docs/observability.md @@ -33,4 +33,20 @@ telefonistka_github_github_rest_api_client_rate_remaining 99668 # HELP telefonistka_webhook_server_webhook_hits_total The total number of validated webhook hits # TYPE telefonistka_webhook_server_webhook_hits_total counter telefonistka_webhook_server_webhook_hits_total{parsing="successful"} 8 +# HELP telefonistka_github_commit_status_updates_total The total number of commit status updates, and their status (success/pending/failure) +# TYPE telefonistka_github_commit_status_updates_total counter +telefonistka_github_commit_status_updates_total{repo_slug="foo/bar2",status="error"} 1 +telefonistka_github_commit_status_updates_total{repo_slug="foo/bar2",status="pending"} 1 +# HELP telefonistka_github_open_promotion_prs The total number of open PRs with promotion label +# TYPE telefonistka_github_open_promotion_prs gauge +telefonistka_github_open_promotion_prs{repo_slug="foo/bar1"} 0 +telefonistka_github_open_promotion_prs{repo_slug="foo/bar2"} 10 +# HELP telefonistka_github_open_prs The total number of open PRs +# TYPE telefonistka_github_open_prs gauge +telefonistka_github_open_prs{repo_slug="foo/bar1"} 0 +telefonistka_github_open_prs{repo_slug="foo/bar2"} 21 +# HELP telefonistka_github_open_prs_with_pending_telefonistka_checks The total number of open PRs with pending Telefonistka checks(excluding PRs with very recent commits) +# TYPE telefonistka_github_open_prs_with_pending_telefonistka_checks gauge +telefonistka_github_open_prs_with_pending_telefonistka_checks{repo_slug="foo/bar1"} 0 +telefonistka_github_open_prs_with_pending_telefonistka_checks{repo_slug="foo/bar2"} 0 ```