[proxy] read cert from disk if available instead of cert manager (#5574)

* **New Features**
  * Asynchronous certificate prefetch that races live issuance with periodic on-disk cache checks to surface certificates faster.
  * Centralized recording and notification when certificates become available.
  * New on-disk certificate reading and validation to allow immediate use of cached certs.

* **Bug Fixes & Performance**
  * Optimized retrieval by polling disk while fetching in background to reduce latency.
  * Added cancellation and timeout handling to fail stalled certificate operations reliably.
This commit is contained in:
Pascal Fischer
2026-03-11 19:18:37 +01:00
committed by GitHub
parent b5489d4986
commit d3d6a327e0

View File

@@ -7,9 +7,12 @@ import (
"encoding/asn1"
"encoding/base64"
"encoding/binary"
"encoding/pem"
"fmt"
"math/rand/v2"
"net"
"slices"
"strings"
"sync"
"time"
@@ -137,7 +140,12 @@ func (mgr *Manager) AddDomain(d domain.Domain, accountID, serviceID string) {
// It acquires a distributed lock to prevent multiple replicas from issuing
// duplicate ACME requests. The second replica will block until the first
// finishes, then find the certificate in the cache.
// ACME and periodic disk reads race; whichever produces a valid certificate
// first wins. This handles cases where locking is unreliable and another
// replica already wrote the cert to the shared cache.
func (mgr *Manager) prefetchCertificate(d domain.Domain) {
time.Sleep(time.Duration(rand.IntN(200)) * time.Millisecond)
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
defer cancel()
@@ -153,26 +161,105 @@ func (mgr *Manager) prefetchCertificate(d domain.Domain) {
defer unlock()
}
hello := &tls.ClientHelloInfo{
ServerName: name,
Conn: &dummyConn{ctx: ctx},
}
start := time.Now()
cert, err := mgr.GetCertificate(hello)
elapsed := time.Since(start)
if err != nil {
mgr.logger.Warnf("prefetch certificate for domain %q in %s: %v", name, elapsed.String(), err)
mgr.setDomainState(d, domainFailed, err.Error())
if cert, err := mgr.readCertFromDisk(ctx, name); err == nil {
mgr.logger.Infof("certificate for domain %q already on disk, skipping ACME", name)
mgr.recordAndNotify(ctx, d, name, cert, 0)
return
}
if mgr.metrics != nil {
// Run ACME in a goroutine so we can race it against periodic disk reads.
// autocert uses its own internal context and cannot be cancelled externally.
type acmeResult struct {
cert *tls.Certificate
err error
}
acmeCh := make(chan acmeResult, 1)
hello := &tls.ClientHelloInfo{ServerName: name, Conn: &dummyConn{ctx: ctx}}
go func() {
cert, err := mgr.GetCertificate(hello)
acmeCh <- acmeResult{cert, err}
}()
start := time.Now()
diskTicker := time.NewTicker(5 * time.Second)
defer diskTicker.Stop()
for {
select {
case res := <-acmeCh:
elapsed := time.Since(start)
if res.err != nil {
mgr.logger.Warnf("prefetch certificate for domain %q in %s: %v", name, elapsed.String(), res.err)
mgr.setDomainState(d, domainFailed, res.err.Error())
return
}
mgr.recordAndNotify(ctx, d, name, res.cert, elapsed)
return
case <-diskTicker.C:
cert, err := mgr.readCertFromDisk(context.Background(), name)
if err != nil {
continue
}
mgr.logger.Infof("certificate for domain %q appeared on disk after %s", name, time.Since(start).Round(time.Millisecond))
// Drain the ACME goroutine before marking ready — autocert holds
// an internal write lock on certState while ACME is in flight.
go func() {
select {
case <-acmeCh:
default:
}
mgr.recordAndNotify(context.Background(), d, name, cert, 0)
}()
return
case <-ctx.Done():
mgr.logger.Warnf("prefetch certificate for domain %q timed out", name)
mgr.setDomainState(d, domainFailed, ctx.Err().Error())
return
}
}
}
// readCertFromDisk reads and parses a certificate directly from the autocert
// DirCache, bypassing autocert's internal certState mutex. Safe to call
// concurrently with an in-flight ACME request for the same domain.
func (mgr *Manager) readCertFromDisk(ctx context.Context, name string) (*tls.Certificate, error) {
if mgr.Cache == nil {
return nil, fmt.Errorf("no cache configured")
}
data, err := mgr.Cache.Get(ctx, name)
if err != nil {
return nil, err
}
privBlock, certsPEM := pem.Decode(data)
if privBlock == nil || !strings.Contains(privBlock.Type, "PRIVATE") {
return nil, fmt.Errorf("no private key in cache for %q", name)
}
cert, err := tls.X509KeyPair(certsPEM, pem.EncodeToMemory(privBlock))
if err != nil {
return nil, fmt.Errorf("parse cached certificate for %q: %w", name, err)
}
if len(cert.Certificate) > 0 {
leaf, err := x509.ParseCertificate(cert.Certificate[0])
if err != nil {
return nil, fmt.Errorf("parse leaf for %q: %w", name, err)
}
if time.Now().After(leaf.NotAfter) {
return nil, fmt.Errorf("cached certificate for %q expired at %s", name, leaf.NotAfter)
}
cert.Leaf = leaf
}
return &cert, nil
}
// recordAndNotify records metrics, marks the domain ready, logs cert details,
// and notifies the cert notifier.
func (mgr *Manager) recordAndNotify(ctx context.Context, d domain.Domain, name string, cert *tls.Certificate, elapsed time.Duration) {
if elapsed > 0 && mgr.metrics != nil {
mgr.metrics.RecordCertificateIssuance(elapsed)
}
mgr.setDomainState(d, domainReady, "")
now := time.Now()
if cert != nil && cert.Leaf != nil {
leaf := cert.Leaf
@@ -188,11 +275,9 @@ func (mgr *Manager) prefetchCertificate(d domain.Domain) {
} else {
mgr.logger.Infof("certificate for domain %q ready in %s", name, elapsed.Round(time.Millisecond))
}
mgr.mu.RLock()
info := mgr.domains[d]
mgr.mu.RUnlock()
if info != nil && mgr.certNotifier != nil {
if err := mgr.certNotifier.NotifyCertificateIssued(ctx, info.accountID, info.serviceID, name); err != nil {
mgr.logger.Warnf("notify certificate ready for domain %q: %v", name, err)