mirror of
https://github.com/netbirdio/netbird.git
synced 2026-03-31 06:24:18 -04:00
[proxy] read cert from disk if available instead of cert manager (#5574)
* **New Features** * Asynchronous certificate prefetch that races live issuance with periodic on-disk cache checks to surface certificates faster. * Centralized recording and notification when certificates become available. * New on-disk certificate reading and validation to allow immediate use of cached certs. * **Bug Fixes & Performance** * Optimized retrieval by polling disk while fetching in background to reduce latency. * Added cancellation and timeout handling to fail stalled certificate operations reliably.
This commit is contained in:
@@ -7,9 +7,12 @@ import (
|
||||
"encoding/asn1"
|
||||
"encoding/base64"
|
||||
"encoding/binary"
|
||||
"encoding/pem"
|
||||
"fmt"
|
||||
"math/rand/v2"
|
||||
"net"
|
||||
"slices"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
@@ -137,7 +140,12 @@ func (mgr *Manager) AddDomain(d domain.Domain, accountID, serviceID string) {
|
||||
// It acquires a distributed lock to prevent multiple replicas from issuing
|
||||
// duplicate ACME requests. The second replica will block until the first
|
||||
// finishes, then find the certificate in the cache.
|
||||
// ACME and periodic disk reads race; whichever produces a valid certificate
|
||||
// first wins. This handles cases where locking is unreliable and another
|
||||
// replica already wrote the cert to the shared cache.
|
||||
func (mgr *Manager) prefetchCertificate(d domain.Domain) {
|
||||
time.Sleep(time.Duration(rand.IntN(200)) * time.Millisecond)
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
|
||||
defer cancel()
|
||||
|
||||
@@ -153,26 +161,105 @@ func (mgr *Manager) prefetchCertificate(d domain.Domain) {
|
||||
defer unlock()
|
||||
}
|
||||
|
||||
hello := &tls.ClientHelloInfo{
|
||||
ServerName: name,
|
||||
Conn: &dummyConn{ctx: ctx},
|
||||
}
|
||||
|
||||
start := time.Now()
|
||||
cert, err := mgr.GetCertificate(hello)
|
||||
elapsed := time.Since(start)
|
||||
if err != nil {
|
||||
mgr.logger.Warnf("prefetch certificate for domain %q in %s: %v", name, elapsed.String(), err)
|
||||
mgr.setDomainState(d, domainFailed, err.Error())
|
||||
if cert, err := mgr.readCertFromDisk(ctx, name); err == nil {
|
||||
mgr.logger.Infof("certificate for domain %q already on disk, skipping ACME", name)
|
||||
mgr.recordAndNotify(ctx, d, name, cert, 0)
|
||||
return
|
||||
}
|
||||
|
||||
if mgr.metrics != nil {
|
||||
// Run ACME in a goroutine so we can race it against periodic disk reads.
|
||||
// autocert uses its own internal context and cannot be cancelled externally.
|
||||
type acmeResult struct {
|
||||
cert *tls.Certificate
|
||||
err error
|
||||
}
|
||||
acmeCh := make(chan acmeResult, 1)
|
||||
hello := &tls.ClientHelloInfo{ServerName: name, Conn: &dummyConn{ctx: ctx}}
|
||||
go func() {
|
||||
cert, err := mgr.GetCertificate(hello)
|
||||
acmeCh <- acmeResult{cert, err}
|
||||
}()
|
||||
|
||||
start := time.Now()
|
||||
diskTicker := time.NewTicker(5 * time.Second)
|
||||
defer diskTicker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case res := <-acmeCh:
|
||||
elapsed := time.Since(start)
|
||||
if res.err != nil {
|
||||
mgr.logger.Warnf("prefetch certificate for domain %q in %s: %v", name, elapsed.String(), res.err)
|
||||
mgr.setDomainState(d, domainFailed, res.err.Error())
|
||||
return
|
||||
}
|
||||
mgr.recordAndNotify(ctx, d, name, res.cert, elapsed)
|
||||
return
|
||||
|
||||
case <-diskTicker.C:
|
||||
cert, err := mgr.readCertFromDisk(context.Background(), name)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
mgr.logger.Infof("certificate for domain %q appeared on disk after %s", name, time.Since(start).Round(time.Millisecond))
|
||||
// Drain the ACME goroutine before marking ready — autocert holds
|
||||
// an internal write lock on certState while ACME is in flight.
|
||||
go func() {
|
||||
select {
|
||||
case <-acmeCh:
|
||||
default:
|
||||
}
|
||||
mgr.recordAndNotify(context.Background(), d, name, cert, 0)
|
||||
}()
|
||||
return
|
||||
|
||||
case <-ctx.Done():
|
||||
mgr.logger.Warnf("prefetch certificate for domain %q timed out", name)
|
||||
mgr.setDomainState(d, domainFailed, ctx.Err().Error())
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// readCertFromDisk reads and parses a certificate directly from the autocert
|
||||
// DirCache, bypassing autocert's internal certState mutex. Safe to call
|
||||
// concurrently with an in-flight ACME request for the same domain.
|
||||
func (mgr *Manager) readCertFromDisk(ctx context.Context, name string) (*tls.Certificate, error) {
|
||||
if mgr.Cache == nil {
|
||||
return nil, fmt.Errorf("no cache configured")
|
||||
}
|
||||
data, err := mgr.Cache.Get(ctx, name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
privBlock, certsPEM := pem.Decode(data)
|
||||
if privBlock == nil || !strings.Contains(privBlock.Type, "PRIVATE") {
|
||||
return nil, fmt.Errorf("no private key in cache for %q", name)
|
||||
}
|
||||
cert, err := tls.X509KeyPair(certsPEM, pem.EncodeToMemory(privBlock))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("parse cached certificate for %q: %w", name, err)
|
||||
}
|
||||
if len(cert.Certificate) > 0 {
|
||||
leaf, err := x509.ParseCertificate(cert.Certificate[0])
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("parse leaf for %q: %w", name, err)
|
||||
}
|
||||
if time.Now().After(leaf.NotAfter) {
|
||||
return nil, fmt.Errorf("cached certificate for %q expired at %s", name, leaf.NotAfter)
|
||||
}
|
||||
cert.Leaf = leaf
|
||||
}
|
||||
return &cert, nil
|
||||
}
|
||||
|
||||
// recordAndNotify records metrics, marks the domain ready, logs cert details,
|
||||
// and notifies the cert notifier.
|
||||
func (mgr *Manager) recordAndNotify(ctx context.Context, d domain.Domain, name string, cert *tls.Certificate, elapsed time.Duration) {
|
||||
if elapsed > 0 && mgr.metrics != nil {
|
||||
mgr.metrics.RecordCertificateIssuance(elapsed)
|
||||
}
|
||||
|
||||
mgr.setDomainState(d, domainReady, "")
|
||||
|
||||
now := time.Now()
|
||||
if cert != nil && cert.Leaf != nil {
|
||||
leaf := cert.Leaf
|
||||
@@ -188,11 +275,9 @@ func (mgr *Manager) prefetchCertificate(d domain.Domain) {
|
||||
} else {
|
||||
mgr.logger.Infof("certificate for domain %q ready in %s", name, elapsed.Round(time.Millisecond))
|
||||
}
|
||||
|
||||
mgr.mu.RLock()
|
||||
info := mgr.domains[d]
|
||||
mgr.mu.RUnlock()
|
||||
|
||||
if info != nil && mgr.certNotifier != nil {
|
||||
if err := mgr.certNotifier.NotifyCertificateIssued(ctx, info.accountID, info.serviceID, name); err != nil {
|
||||
mgr.logger.Warnf("notify certificate ready for domain %q: %v", name, err)
|
||||
|
||||
Reference in New Issue
Block a user