Grosse refonte (major refactoring)

Mickael BOURNEUF 2025-03-01 18:41:36 +01:00
parent 76e8d66875
commit 61c54f3d8f
90 changed files with 11024 additions and 356 deletions


@ -1,22 +1,11 @@
package domain
import (
"context"
"encoding/json"
"fmt"
"log"
"time"
"go.uber.org/zap"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
"libvirt.org/go/libvirt"
"deevirt.fr/compute/pkg/api/proto"
"deevirt.fr/compute/pkg/api/raft"
"deevirt.fr/compute/pkg/config"
"deevirt.fr/compute/pkg/scheduler"
deevirt_schema "deevirt.fr/compute/pkg/schema/deevirt"
"deevirt.fr/compute/pkg/raft"
)
type Domain struct {
@ -26,8 +15,8 @@ type Domain struct {
proto.UnimplementedDomainServer
}
func (d *Domain) connectNode(NodeId string) (*libvirt.Connect, error) {
var jCluster deevirt_schema.NodeStore
/*func (d *Domain) connectNode(NodeId string) (*libvirt.Connect, error) {
var jCluster schema.NodeStore
cluster, _ := d.Store.Get("/etc/libvirt/cluster")
json.Unmarshal(cluster, &jCluster)
@ -51,7 +40,7 @@ func (d *Domain) connectDomain(ctx context.Context, domainID string) (string, *l
DomainId: domainID,
})
var jCluster deevirt_schema.NodeStore
var jCluster schema.NodeStore
cluster, _ := d.Store.Get("/etc/libvirt/cluster")
json.Unmarshal(cluster, &jCluster)
@ -72,15 +61,15 @@ func (d *Domain) List(ctx context.Context, in *proto.DomainListAllRequest) (*pro
}
for domId, data := range domains {
domData := deevirt_schema.DomainStore{}
domData := schema.Domain{}
json.Unmarshal(data, &domData)
nodeData, _ := d.Store.Get(fmt.Sprintf("/etc/libvirt/%s/%s/%s", domData.Type, domData.NodeId, domId))
domNodeData := deevirt_schema.DomainToNodeStore{}
domNodeData := schema.DomainToNode{}
json.Unmarshal(nodeData, &domNodeData)
domainsListResponse = append(domainsListResponse, &proto.DomainListResponse{
NodeId: domData.NodeId,
//NodeId: domData.NodeId,
DomainId: domId,
Config: string(domData.Config),
State: int64(domNodeData.State),
@ -100,11 +89,11 @@ func (d *Domain) Get(ctx context.Context, req *proto.DomainListRequest) (*proto.
return nil, status.Errorf(codes.Internal, "Error read a store %v", err)
}
domData := deevirt_schema.DomainStore{}
domData := deevirt_schema.Domain{}
json.Unmarshal(domain, &domData)
nodeData, _ := d.Store.Get(fmt.Sprintf("/etc/libvirt/%s/%s/%s", domData.Type, domData.NodeId, req.DomainId))
domNodeData := deevirt_schema.DomainToNodeStore{}
domNodeData := deevirt_schema.DomainToNode{}
json.Unmarshal(nodeData, &domNodeData)
domainsListResponse = proto.DomainListResponse{
@ -198,4 +187,4 @@ func (d *Domain) Migrate(in *proto.DomainMigrateRequest, stream proto.Domain_Mig
time.Sleep(500 * time.Millisecond)
}
}
}
}*/


@ -3,15 +3,8 @@ package domain
import (
"context"
"encoding/json"
"encoding/xml"
"fmt"
"log"
"strings"
"deevirt.fr/compute/pkg/amqp"
"deevirt.fr/compute/pkg/api/proto"
"deevirt.fr/compute/pkg/schema"
deevirt_schema "deevirt.fr/compute/pkg/schema/deevirt"
"libvirt.org/go/libvirt"
)
@ -20,7 +13,7 @@ type EventsDetail map[string]string
func (d *Domain) domainEventLifecycle(nodeId string, domainId string, state int64, event *libvirt.DomainEventLifecycle) {
d.Logger.Sugar().Infof("%s => %s: Evènement %v", nodeId, domainId, event)
domStore := deevirt_schema.DomainStore{}
/*domStore := schema.Domain{}
domData, err := d.Store.Get(fmt.Sprintf("/etc/libvirt/domain/%s", domainId))
if err != nil || len(domData) == 0 {
d.Logger.Sugar().Errorf("Critique !!, la VM %s n'existe pas ou comporte une erreur importante !", domainId)
@ -31,7 +24,7 @@ func (d *Domain) domainEventLifecycle(nodeId string, domainId string, state int6
case libvirt.DOMAIN_EVENT_DEFINED:
// Changement de noeud !
oldNodeId := strings.Clone(domStore.NodeId)
dom2node, _ := json.Marshal(deevirt_schema.DomainToNodeStore{
dom2node, _ := json.Marshal(deevirt_schema.DomainToNode{
State: int(state),
})
d.Store.Set(fmt.Sprintf("/etc/libvirt/domain/qemu/%s/%s", nodeId, domainId), dom2node)
@ -65,7 +58,7 @@ func (d *Domain) domainEventLifecycle(nodeId string, domainId string, state int6
// MAJ de l'état
nodeData, _ := d.Store.Get(fmt.Sprintf("/etc/libvirt/qemu/%s/%s", nodeId, domainId))
domNodeData := deevirt_schema.DomainToNodeStore{}
domNodeData := deevirt_schema.DomainToNode{}
json.Unmarshal(nodeData, &domNodeData)
domNodeData.State = int(state)
@ -92,7 +85,7 @@ func (d *Domain) domainEventLifecycle(nodeId string, domainId string, state int6
"."+desc.Metadata.DeevirtInstance.DeevirtDatacenterID+
"."+domainId,
e)
defer a.Close()
defer a.Close()*/
}
func (d *Domain) Event(ctx context.Context, req *proto.DomainEventRequest) (*proto.DomainEventResponse, error) {


@ -1,9 +1,108 @@
package main
import (
"deevirt.fr/compute/pkg/api"
"crypto/tls"
"crypto/x509"
"fmt"
"log"
"net"
"os"
"time"
raft_hashicorp "github.com/hashicorp/raft"
"go.uber.org/zap"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials"
"google.golang.org/grpc/reflection"
"deevirt.fr/compute/cmd/mgr/domain"
"deevirt.fr/compute/cmd/mgr/node"
"deevirt.fr/compute/cmd/mgr/worker"
pb "deevirt.fr/compute/pkg/api/proto"
"deevirt.fr/compute/pkg/config"
"deevirt.fr/compute/pkg/raft"
)
func main() {
api.Server()
func createGRPCServer(conf *config.Config) *grpc.Server {
if conf.Manager.TlsKey != "" {
cert, err := tls.LoadX509KeyPair(conf.Manager.TlsCert, conf.Manager.TlsKey)
if err != nil {
log.Fatalf("Erreur chargement du certificat: %v", err)
}
// Charger la CA (facultatif, pour la vérification des clients)
caCert, err := os.ReadFile(conf.Manager.TlsCert)
if err != nil {
log.Fatalf("Erreur chargement CA: %v", err)
}
certPool := x509.NewCertPool()
certPool.AppendCertsFromPEM(caCert)
// Créer les credentials TLS
creds := credentials.NewTLS(&tls.Config{
Certificates: []tls.Certificate{cert},
ClientCAs: certPool,
ClientAuth: tls.RequireAndVerifyClientCert, // Authentification mutuelle (mTLS),
})
return grpc.NewServer(grpc.Creds(creds))
}
return grpc.NewServer()
}
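(For context, not part of this diff: a client that calls this manager needs matching mutual-TLS credentials. A minimal client-side sketch; the file paths and address below are placeholders, only port 4480 comes from this commit.)

```go
// Hypothetical client-side counterpart to createGRPCServer: dials the manager
// with mutual TLS. Certificate paths and the address are illustrative only.
package client

import (
	"crypto/tls"
	"crypto/x509"
	"fmt"
	"os"

	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials"
)

func dialManager(addr, certFile, keyFile, caFile string) (*grpc.ClientConn, error) {
	cert, err := tls.LoadX509KeyPair(certFile, keyFile) // client certificate presented to the server
	if err != nil {
		return nil, err
	}
	caCert, err := os.ReadFile(caFile) // CA used to verify the server certificate
	if err != nil {
		return nil, err
	}
	pool := x509.NewCertPool()
	if !pool.AppendCertsFromPEM(caCert) {
		return nil, fmt.Errorf("invalid CA bundle %s", caFile)
	}
	creds := credentials.NewTLS(&tls.Config{
		Certificates: []tls.Certificate{cert},
		RootCAs:      pool,
	})
	// e.g. dialManager("manager.example:4480", "client.crt", "client.key", "ca.crt")
	return grpc.Dial(addr, grpc.WithTransportCredentials(creds))
}
```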
func main() {
logger, _ := zap.NewProduction()
// Récupération de la configuration deevirt
conf, err := config.New()
if err != nil {
log.Fatalf("failed load configuration: %v", err)
}
sock, err := net.Listen("tcp", fmt.Sprintf(":%d", 4480))
if err != nil {
log.Fatalf("failed to listen: %v", err)
}
r := raft.New(conf)
s, tm, err := r.Open()
if err != nil {
log.Fatalf("failed to start raft: %v", err)
}
// Observer pour surveiller les changements d'état
stateCh := make(chan raft_hashicorp.Observation, 1) // Canal de type raft.Observation
s.Raft.RegisterObserver(raft_hashicorp.NewObserver(stateCh, true, nil))
nodes := &worker.RaftNode{
Bootstrap: false,
Store: s,
NodeID: conf.NodeID,
StateCh: stateCh,
}
go nodes.WatchStateChanges()
// On temporise 5 secondes, le temps de laisser la reprise des logs
time.Sleep(5 * time.Second)
server := createGRPCServer(conf)
pb.RegisterNodeServer(server, &node.Node{
Config: conf,
Store: r,
})
pb.RegisterDomainServer(server, &domain.Domain{
Config: conf,
Store: r,
Logger: logger,
})
tm.Register(server)
//leaderhealth.Setup(r, s, []string{"Example"})
raft.Register(server, r.Raft)
reflection.Register(server)
if err := server.Serve(sock); err != nil {
log.Fatalf("failed to serve: %v", err)
}
}


@ -2,13 +2,14 @@ package node
import (
"encoding/json"
"fmt"
"io"
"log"
"deevirt.fr/compute/pkg/api/proto"
"deevirt.fr/compute/pkg/api/raft"
"deevirt.fr/compute/pkg/config"
deevirt_schema "deevirt.fr/compute/pkg/schema/deevirt"
"deevirt.fr/compute/pkg/raft"
"deevirt.fr/compute/pkg/schema"
)
type Node struct {
@ -29,9 +30,11 @@ func (n *Node) Alive(stream proto.Node_AliveServer) error {
log.Printf("Received heartbeat: %v", req)
cluster := deevirt_schema.NodeStore{}
cluster := schema.NodeStore{}
println("on reçit une demande")
res, _ := n.Store.Get("/etc/libvirt/cluster")
json.Unmarshal(res, &cluster)
fmt.Printf("%v\n", res)
cluster[n.Config.NodeID].LastUpdate = req.Timestamp
d, _ := json.Marshal(cluster)


@ -1,43 +1,47 @@
package raft
package worker
import (
"encoding/base64"
"encoding/json"
"fmt"
"log"
"time"
raft_hashicorp "github.com/hashicorp/raft"
"google.golang.org/protobuf/types/known/timestamppb"
"libvirt.org/go/libvirt"
"deevirt.fr/compute/pkg/config"
etcd_client "deevirt.fr/compute/pkg/etcd"
deevirt_schema "deevirt.fr/compute/pkg/schema/deevirt"
"deevirt.fr/compute/pkg/raft"
"deevirt.fr/compute/pkg/schema"
//"deevirt.fr/compute/pkg/scheduler"
)
type RaftNode struct {
Bootstrap bool
Raft *raft_hashicorp.Raft
Store *Store
Conf *config.Config
Store *raft.Store
NodeID string
StateCh chan raft_hashicorp.Observation
}
func (n *RaftNode) init() {
println("bootstrap :")
nodes := make(deevirt_schema.NodeStore)
nodes := make(schema.NodeStore)
// Récupération des Noeuds ID
etcd, _ := etcd_client.New(n.Store.conf.EtcdURI)
etcd, _ := etcd_client.New(n.Conf.EtcdURI)
defer etcd.Close()
for key, value := range etcd_client.GetNodes(etcd, n.Store.conf.ClusterID) {
for key, value := range etcd_client.GetNodes(etcd, n.Conf.ClusterID) {
var libvirt_uri string
nodes[key] = &deevirt_schema.NodeStoreInfo{
nodes[key] = &schema.NodeStoreInfo{
IpManagement: value.IpManagement,
}
if n.Store.conf.LibvirtTLS {
if n.Conf.LibvirtTLS {
libvirt_uri = fmt.Sprintf("qemu+tls://%s/system", value.IpManagement)
} else {
libvirt_uri = fmt.Sprintf("qemu+tcp://%s/system", value.IpManagement)
@ -49,28 +53,42 @@ func (n *RaftNode) init() {
}
defer c.Close()
// On récupère la liste des domaines.
getDomains, _ := c.ListAllDomains(libvirt.CONNECT_LIST_DOMAINS_PERSISTENT)
for _, domain := range getDomains {
conf, _ := domain.GetXMLDesc(libvirt.DOMAIN_XML_INACTIVE)
uuid, _ := domain.GetUUIDString()
state, _, _ := domain.GetState()
dStore, _ := json.Marshal(deevirt_schema.DomainStore{
NodeId: key,
Type: "qemu",
Config: []byte(conf),
})
n.Store.Set(fmt.Sprintf("/etc/libvirt/domain/%s", uuid), dStore)
// On enregistre la configuration
domainStore, _ := json.Marshal(schema.Domain{
dDomainToNode, _ := json.Marshal(deevirt_schema.DomainToNodeStore{
State: int(state),
Type: "qemu",
Config: base64.StdEncoding.EncodeToString([]byte(conf)),
})
n.Store.Set(fmt.Sprintf("/etc/libvirt/qemu/%s/%s", key, uuid), dDomainToNode)
n.Store.Set(fmt.Sprintf("/domain/%s", uuid), domainStore)
// On enregistre le noeud
domainStateStore, _ := json.Marshal(schema.DomainNode{
NodeId: key,
State: int(state),
})
n.Store.Set(fmt.Sprintf("/domain/%s/node", uuid), domainStateStore)
// On associe au noeud
currentTime := time.Now()
newTime := currentTime.Add(3600 * time.Second) // On ajoute 3600 secondes pour permettre au moniteur de se synchroniser
DomainLibvirtStore, _ := json.Marshal(schema.DomainLock{
LifeCycle: int(state),
Expiry: timestamppb.New(newTime),
})
n.Store.Set(fmt.Sprintf("/etc/libvirt/qemu/%s/%s", key, uuid), DomainLibvirtStore)
}
}
jNodes, _ := json.Marshal(nodes)
n.Store.Set("/etc/libvirt/cluster", jNodes)
n.Store.Set("/cluster", jNodes)
}
// Fonction pour surveiller et afficher les changements d'état
@ -80,33 +98,35 @@ func (n *RaftNode) WatchStateChanges() {
for obs := range n.StateCh {
switch evt := obs.Data.(type) {
case raft_hashicorp.RaftState:
log.Println("[ÉVÉNEMENT] Changement d'état Raft :", evt)
if evt == raft_hashicorp.Leader {
log.Println("[ÉVÉNEMENT] Changement d'état Raft :", evt)
if n.Bootstrap {
n.init()
}
// On attend une seconde avant de démarrer le worker
time.Sleep(1 * time.Second)
// On attend que les logs soient synchronisés !
barrier := n.Store.Raft.Barrier(10 * time.Second)
if err := barrier.Error(); err != nil {
return
}
log.Println("Démarrage du worker !")
worker.Start()
} else {
worker.Stop()
}
log.Println("[ÉVÉNEMENT] Changement d'état Raft :", evt)
case raft_hashicorp.LeaderObservation:
log.Println("[ÉVÉNEMENT] Le leader est", evt.LeaderID)
case raft_hashicorp.PeerObservation:
if n.Raft.State() == raft_hashicorp.Leader {
if n.Store.Raft.State() == raft_hashicorp.Leader {
peerID := evt.Peer.ID
peerAddr := evt.Peer.Address
log.Println("[NOUVEAU NŒUD] Détection de", peerID, "à", peerAddr)
log.Println("[ACTION] Ajout automatique en tant que voter...")
future := n.Raft.AddVoter(peerID, peerAddr, 0, 0)
future := n.Store.Raft.AddVoter(peerID, peerAddr, 0, 0)
if err := future.Error(); err != nil {
log.Println("[ERREUR] Impossible d'ajouter", peerID, ":", err)
} else {


@ -1,4 +1,4 @@
package raft
package worker
import (
"context"
@ -11,10 +11,11 @@ import (
"go.uber.org/zap"
"deevirt.fr/compute/pkg/api/libvirt"
"deevirt.fr/compute/pkg/config"
"deevirt.fr/compute/pkg/scheduler"
deevirt_schema "deevirt.fr/compute/pkg/schema/deevirt"
"deevirt.fr/compute/pkg/libvirt"
scheduler "deevirt.fr/compute/pkg/metrics"
"deevirt.fr/compute/pkg/raft"
"deevirt.fr/compute/pkg/schema"
)
type Worker struct {
@ -22,14 +23,14 @@ type Worker struct {
cancel context.CancelFunc
cancelled bool
store *Store
store *raft.Store
config *config.Config
nodes deevirt_schema.NodeStore
nodes schema.NodeStore
log *zap.SugaredLogger
}
func NewWorker(r *Store) (*Worker, error) {
func NewWorker(r *raft.Store) (*Worker, error) {
config, _ := config.New()
ctx, cancel := context.WithCancel(context.Background())
@ -133,7 +134,7 @@ On controle périodiquement l'accessibilité à libvirt, indépendamment du prog
Cette vérification assure un double controle pour la HA.
*/
func (w *Worker) handleLibvirtControl() {
var nodes deevirt_schema.NodeStore
var nodes schema.NodeStore
cluster, err := w.store.Get("/etc/libvirt/cluster")
if err != nil {
w.log.Errorf("Erreur lors de la récupération des données de cluster: %v", err)
@ -149,7 +150,7 @@ func (w *Worker) handleLibvirtControl() {
for _, conf := range nodes {
// Créer une connexion à libvirt
c, err := libvirt.New(conf.IpManagement, w.store.conf.LibvirtTLS)
c, err := libvirt.New(conf.IpManagement, w.config.LibvirtTLS)
if err != nil {
w.log.Warnf("Impossible de créer la connexion libvirt pour %s: %v", conf.IpManagement, err)
//conf.Alive = false


@ -17,13 +17,13 @@ import (
"deevirt.fr/compute/pkg/api/libvirt"
pb "deevirt.fr/compute/pkg/api/proto"
"deevirt.fr/compute/pkg/config"
deevirt_schema "deevirt.fr/compute/pkg/schema/deevirt"
"deevirt.fr/compute/pkg/schema"
)
type qemu struct {
clientVirt *go_libvirt.Connect
config *config.Config
nodes deevirt_schema.NodeStore
nodes schema.NodeStore
}
func NewQemu(c *go_libvirt.Connect) qemu {
@ -47,13 +47,18 @@ func (q qemu) stonith(ctx context.Context) {
log.Printf("L'accessibilité avec les manager est revenue, la procédure d'urgence est avortée.")
return
case <-time.After(10 * time.Second):
// On controle l'accessibilité des autres serveurs via libvirt, si un serveur est accessible, on peut supposer un problème avec le manager
for _, domData := range q.nodes {
_, err := libvirt.New(domData.IpManagement, q.config.LibvirtTLS)
if err == nil {
log.Printf("Au moins un noeud est joignable, la procédure d'urgence est avortée.")
return
if len(q.nodes) > 0 {
// On controle l'accessibilité des autres serveurs via libvirt, si un serveur est accessible, on peut supposer un problème avec le manager
for _, domData := range q.nodes {
_, err := libvirt.New(domData.IpManagement, q.config.LibvirtTLS)
if err == nil {
log.Printf("Au moins un noeud est joignable, la procédure d'urgence est avortée.")
return
}
}
} else {
log.Printf("Le noeud est indépendant, on avorte")
return
}
// Manager inaccessible et autres noeuds libvirt aussi
@ -110,7 +115,7 @@ func (q qemu) heartbeat() {
log.Println("🔌 Connexion fermée par le serveur")
break
} else {
nodeStore := deevirt_schema.NodeStore{}
nodeStore := schema.NodeStore{}
json.Unmarshal(resp.Nodes, &nodeStore)
q.nodes = nodeStore
}
@ -140,7 +145,7 @@ func (q qemu) heartbeat() {
}
time.Sleep(1 * time.Second)
time.Sleep(100 * time.Millisecond)
}
}


@ -240,7 +240,7 @@ func CollectDomain(ch chan<- prometheus.Metric, stat libvirt.DomainStats, hostna
if err != nil {
return err
}
var desc schema.Domain
var desc schema.DomainXML
err = xml.Unmarshal([]byte(xmlDesc), &desc)
if err != nil {
return err
@ -297,7 +297,7 @@ func CollectDomain(ch chan<- prometheus.Metric, stat libvirt.DomainStats, hostna
return nil
}
func CollectDomainVCPU(ch chan<- prometheus.Metric, stat []libvirt.DomainStatsVcpu, hostname string, domainUUID string, desc schema.Domain) {
func CollectDomainVCPU(ch chan<- prometheus.Metric, stat []libvirt.DomainStatsVcpu, hostname string, domainUUID string, desc schema.DomainXML) {
for idx, vcpu := range stat {
ch <- prometheus.MustNewConstMetric(
libvirtDomainVcpuState,
@ -335,7 +335,7 @@ func CollectDomainVCPU(ch chan<- prometheus.Metric, stat []libvirt.DomainStatsVc
}
}
func CollectDomainBalloon(ch chan<- prometheus.Metric, stat *libvirt.DomainStatsBalloon, hostname string, domainUUID string, desc schema.Domain) {
func CollectDomainBalloon(ch chan<- prometheus.Metric, stat *libvirt.DomainStatsBalloon, hostname string, domainUUID string, desc schema.DomainXML) {
ch <- prometheus.MustNewConstMetric(
libvirtDomainBalloonStatCurrentBytes,
prometheus.GaugeValue,
@ -416,7 +416,7 @@ func CollectDomainBalloon(ch chan<- prometheus.Metric, stat *libvirt.DomainStats
}
func CollectDomainBlock(ch chan<- prometheus.Metric, stat []libvirt.DomainStatsBlock, hostname string, domainUUID string, desc schema.Domain) {
func CollectDomainBlock(ch chan<- prometheus.Metric, stat []libvirt.DomainStatsBlock, hostname string, domainUUID string, desc schema.DomainXML) {
for _, block := range stat {
if block.RdBytesSet {
@ -532,7 +532,7 @@ func CollectDomainBlock(ch chan<- prometheus.Metric, stat []libvirt.DomainStatsB
}
}
func CollectDomainNet(ch chan<- prometheus.Metric, stat []libvirt.DomainStatsNet, hostname string, domainUUID string, desc schema.Domain) {
func CollectDomainNet(ch chan<- prometheus.Metric, stat []libvirt.DomainStatsNet, hostname string, domainUUID string, desc schema.DomainXML) {
for _, iface := range stat {
if iface.RxBytesSet {

go.mod

@ -24,10 +24,14 @@ require (
require (
github.com/armon/go-metrics v0.4.1 // indirect
github.com/benbjohnson/immutable v0.4.0 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/boltdb/bolt v1.3.1 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/coreos/etcd v3.3.27+incompatible // indirect
github.com/coreos/go-semver v0.3.1 // indirect
github.com/coreos/go-systemd v0.0.0-20191104093116-d3cd4ed1dbcf // indirect
github.com/coreos/pkg v0.0.0-20220810130054-c7d1c02cb6cf // indirect
github.com/fatih/color v1.18.0 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/golang/protobuf v1.5.4 // indirect
@ -38,6 +42,7 @@ require (
github.com/hashicorp/go-msgpack v1.1.5 // indirect
github.com/hashicorp/go-msgpack/v2 v2.1.3 // indirect
github.com/hashicorp/golang-lru v1.0.2 // indirect
github.com/hashicorp/raft-wal v0.4.2 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/klauspost/compress v1.18.0 // indirect
github.com/mattn/go-colorable v0.1.14 // indirect
@ -51,6 +56,7 @@ require (
go.etcd.io/etcd/api/v3 v3.5.18 // indirect
go.etcd.io/etcd/client/pkg/v3 v3.5.18 // indirect
go.uber.org/multierr v1.11.0 // indirect
golang.org/x/exp v0.0.0-20220827204233-334a2380cb91 // indirect
golang.org/x/net v0.35.0 // indirect
golang.org/x/oauth2 v0.27.0 // indirect
golang.org/x/sys v0.30.0 // indirect

go.sum

@ -7,6 +7,8 @@ github.com/alecthomas/units v0.0.0-20190717042225-c3de453c63f4/go.mod h1:ybxpYRF
github.com/alecthomas/units v0.0.0-20190924025748-f65c72e2690d/go.mod h1:rBZYJk541a8SKzHPHnH3zbiI+7dagKZ0cgpgrD7Fyho=
github.com/armon/go-metrics v0.4.1 h1:hR91U9KYmb6bLBYLQjyM+3j+rcd/UhE+G78SFnF8gJA=
github.com/armon/go-metrics v0.4.1/go.mod h1:E6amYzXo6aW1tqzoZGT755KkbgrJsSdpwZ+3JqfkOG4=
github.com/benbjohnson/immutable v0.4.0 h1:CTqXbEerYso8YzVPxmWxh2gnoRQbbB9X1quUC8+vGZA=
github.com/benbjohnson/immutable v0.4.0/go.mod h1:iAr8OjJGLnLmVUr9MZ/rz4PWUy6Ouc2JLYuMArmvAJM=
github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q=
github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
@ -18,10 +20,16 @@ github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UF
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/circonus-labs/circonus-gometrics v2.3.1+incompatible/go.mod h1:nmEj6Dob7S7YxXgwXpfOuvO54S+tGdZdw9fuRZt25Ag=
github.com/circonus-labs/circonusllhist v0.1.3/go.mod h1:kMXHVDlOchFAehlya5ePtbp5jckzBHf4XRpQvBOLI+I=
github.com/coreos/etcd v3.3.27+incompatible h1:QIudLb9KeBsE5zyYxd1mjzRSkzLg9Wf9QlRwFgd6oTA=
github.com/coreos/etcd v3.3.27+incompatible/go.mod h1:uF7uidLiAD3TWHmW31ZFd/JWoc32PjwdhPthX9715RE=
github.com/coreos/go-semver v0.3.1 h1:yi21YpKnrx1gt5R+la8n5WgS0kCrsPp33dmEyHReZr4=
github.com/coreos/go-semver v0.3.1/go.mod h1:irMmmIw/7yzSRPWryHsK7EYSg09caPQL03VsM8rvUec=
github.com/coreos/go-systemd v0.0.0-20191104093116-d3cd4ed1dbcf h1:iW4rZ826su+pqaw19uhpSCzhj44qo35pNgKFGqzDKkU=
github.com/coreos/go-systemd v0.0.0-20191104093116-d3cd4ed1dbcf/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4=
github.com/coreos/go-systemd/v22 v22.5.0 h1:RrqgGjYQKalulkV8NGVIfkXQf6YYmOyiJKk8iXXhfZs=
github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
github.com/coreos/pkg v0.0.0-20220810130054-c7d1c02cb6cf h1:GOPo6vn/vTN+3IwZBvXX0y5doJfSC7My0cdzelyOCsQ=
github.com/coreos/pkg v0.0.0-20220810130054-c7d1c02cb6cf/go.mod h1:E3G3o1h8I7cfcXa63jLwjI0eiQQMgzzUDFVpN/nH/eA=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
@ -96,6 +104,8 @@ github.com/hashicorp/raft-boltdb v0.0.0-20230125174641-2a8082862702 h1:RLKEcCuKc
github.com/hashicorp/raft-boltdb v0.0.0-20230125174641-2a8082862702/go.mod h1:nTakvJ4XYq45UXtn0DbwR4aU9ZdjlnIenpbs6Cd+FM0=
github.com/hashicorp/raft-boltdb/v2 v2.3.1 h1:ackhdCNPKblmOhjEU9+4lHSJYFkJd6Jqyvj6eW9pwkc=
github.com/hashicorp/raft-boltdb/v2 v2.3.1/go.mod h1:n4S+g43dXF1tqDT+yzcXHhXM6y7MrlUd3TTwGRcUvQE=
github.com/hashicorp/raft-wal v0.4.2 h1:DV1jgqEumNfdNpOaZ9mL1Gu7Mz59epFtiE6CoqnHrlY=
github.com/hashicorp/raft-wal v0.4.2/go.mod h1:S92ainH+6fRuWk6BtZKJ8EgcGgNTKx48Hk5dhOOY1DM=
github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2EA=
github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4=
github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU=
@ -216,6 +226,8 @@ golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnf
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/exp v0.0.0-20220827204233-334a2380cb91 h1:tnebWN09GYg9OLPss1KXj8txwZc6X6uMr6VFdcGNbHw=
golang.org/x/exp v0.0.0-20220827204233-334a2380cb91/go.mod h1:cyybsKvd6eL0RnXn6p/Grxp8F5bW7iYuBgsNCOHpMYE=
golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=


@ -1,105 +0,0 @@
package raft
import (
"encoding/json"
"fmt"
"io"
"github.com/hashicorp/raft"
)
type Fsm Store
// Apply applies a Raft log entry to the key-value store.
func (f *Fsm) Apply(l *raft.Log) interface{} {
/*if f.Raft.State() == raft.Leader {
println("j'insert dans etcd !")
}*/
var c command
if err := json.Unmarshal(l.Data, &c); err != nil {
panic(fmt.Sprintf("failed to unmarshal command: %s", err.Error()))
}
switch c.Op {
case "set":
return f.applySet(c.Key, c.Value)
case "delete":
return f.applyDelete(c.Key)
default:
panic(fmt.Sprintf("unrecognized command op: %s", c.Op))
}
}
// Snapshot returns a snapshot of the key-value store.
func (f *Fsm) Snapshot() (raft.FSMSnapshot, error) {
f.mu.Lock()
defer f.mu.Unlock()
// Clone the map.
o := make(map[string][]byte)
for k, v := range f.m {
o[k] = v
}
return &fsmSnapshot{store: o}, nil
}
// Restore stores the key-value store to a previous state.
func (f *Fsm) Restore(rc io.ReadCloser) error {
o := make(map[string][]byte)
if err := json.NewDecoder(rc).Decode(&o); err != nil {
return err
}
// Set the state from the snapshot, no lock required according to
// Hashicorp docs.
f.m = o
return nil
}
func (f *Fsm) applySet(key string, value []byte) interface{} {
f.mu.Lock()
defer f.mu.Unlock()
f.m[key] = value
return nil
}
func (f *Fsm) applyDelete(key string) interface{} {
f.mu.Lock()
defer f.mu.Unlock()
delete(f.m, key)
return nil
}
type fsmSnapshot struct {
store map[string][]byte
}
func (f *fsmSnapshot) Persist(sink raft.SnapshotSink) error {
err := func() error {
// Encode data.
b, err := json.Marshal(f.store)
if err != nil {
return err
}
// Write data to sink.
if _, err := sink.Write(b); err != nil {
return err
}
// Close the sink.
return sink.Close()
}()
if err != nil {
sink.Cancel()
}
return err
}
func (f *fsmSnapshot) Release() {
}


@ -1,26 +0,0 @@
package raft
/*type NodeStore map[string]*NodeStoreInfo
type NodeStoreInfo struct {
IpManagement string
Alive bool
Scoring int
}
type DomainStore struct {
Config string `json:"config"`
State int `json:"state"`
Migrate bool `json:"Migrate"`
}
type SchemaDomain struct {
Config string `json:"config"`
State int `json:"state"`
}*/
// Metrics
type DomainUsage struct {
DomID string
Usage float64
}


@ -1,90 +0,0 @@
package api
import (
"crypto/tls"
"crypto/x509"
"fmt"
"log"
"net"
"os"
"go.uber.org/zap"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials"
"google.golang.org/grpc/reflection"
"deevirt.fr/compute/pkg/api/domain"
"deevirt.fr/compute/pkg/api/node"
pb "deevirt.fr/compute/pkg/api/proto"
"deevirt.fr/compute/pkg/api/raft"
"deevirt.fr/compute/pkg/config"
)
func createGRPCServer(conf *config.Config) *grpc.Server {
if conf.Manager.TlsKey != "" {
cert, err := tls.LoadX509KeyPair(conf.Manager.TlsCert, conf.Manager.TlsKey)
if err != nil {
log.Fatalf("Erreur chargement du certificat: %v", err)
}
// Charger la CA (facultatif, pour la vérification des clients)
caCert, err := os.ReadFile(conf.Manager.TlsCert)
if err != nil {
log.Fatalf("Erreur chargement CA: %v", err)
}
certPool := x509.NewCertPool()
certPool.AppendCertsFromPEM(caCert)
// Créer les credentials TLS
creds := credentials.NewTLS(&tls.Config{
Certificates: []tls.Certificate{cert},
ClientCAs: certPool,
ClientAuth: tls.RequireAndVerifyClientCert, // Authentification mutuelle (mTLS),
})
return grpc.NewServer(grpc.Creds(creds))
}
return grpc.NewServer()
}
func Server() {
logger, _ := zap.NewProduction()
// Récupération de la configuration deevirt
conf, err := config.New()
if err != nil {
log.Fatalf("failed load configuration: %v", err)
}
sock, err := net.Listen("tcp", fmt.Sprintf(":%d", 4480))
if err != nil {
log.Fatalf("failed to listen: %v", err)
}
r := raft.New(conf)
tm, err := r.Open()
if err != nil {
log.Fatalf("failed to start raft: %v", err)
}
s := createGRPCServer(conf)
pb.RegisterNodeServer(s, &node.Node{
Config: conf,
Store: r,
})
pb.RegisterDomainServer(s, &domain.Domain{
Config: conf,
Store: r,
Logger: logger,
})
tm.Register(s)
//leaderhealth.Setup(r, s, []string{"Example"})
raft.Register(s, r.Raft)
reflection.Register(s)
if err := s.Serve(sock); err != nil {
log.Fatalf("failed to serve: %v", err)
}
}

pkg/raft/fsm.go (new file)

@ -0,0 +1,147 @@
package raft
import (
"context"
"encoding/json"
"fmt"
"io"
"regexp"
"time"
"github.com/hashicorp/raft"
clientv3 "go.etcd.io/etcd/client/v3"
)
type FSM struct {
store *Store
client *clientv3.Client
}
func NewFSM(endpoints []string, store *Store) (*FSM, error) {
// Se connecter au cluster etcd
client, err := clientv3.New(clientv3.Config{
Endpoints: endpoints,
DialTimeout: 5 * time.Second,
})
if err != nil {
return nil, err
}
return &FSM{
store: store,
client: client,
}, nil
}
// Apply applies a Raft log entry to the key-value store.
func (f *FSM) Apply(l *raft.Log) interface{} {
switch l.Type {
case raft.LogCommand:
var c command
if err := json.Unmarshal(l.Data, &c); err != nil {
panic(fmt.Sprintf("failed to unmarshal command: %s", err.Error()))
}
switch c.Op {
case "set":
f.applySet(c.Key, c.Value)
case "delete":
f.applyDelete(c.Key)
default:
panic(fmt.Sprintf("unrecognized command op: %s", c.Op))
}
// On réplique sur etcd si ce n'est pas une reprise des logs et si le noeud est leader
if l.Index > f.store.lastIndex && f.store.Raft.State() == raft.Leader {
regex := regexp.MustCompile(`^/domain`)
match := regex.MatchString(c.Key)
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
if match {
switch c.Op {
case "set":
f.client.Put(ctx, fmt.Sprintf("/deevirt/cluster/%s%s", f.store.conf.ClusterID, c.Key), string(c.Value))
case "delete":
f.client.Delete(ctx, fmt.Sprintf("/deevirt/cluster/%s%s", f.store.conf.ClusterID, c.Key))
}
}
defer cancel()
}
default:
println(l.Type.String())
}
return nil
}
// Snapshot returns a snapshot of the key-value store.
func (f *FSM) Snapshot() (raft.FSMSnapshot, error) {
f.store.mu.Lock()
defer f.store.mu.Unlock()
// Clone the map.
o := make(map[string][]byte)
for k, v := range f.store.m {
o[k] = v
}
return &fsmSnapshot{store: o}, nil
}
// Restore stores the key-value store to a previous state.
func (f *FSM) Restore(rc io.ReadCloser) error {
o := make(map[string][]byte)
if err := json.NewDecoder(rc).Decode(&o); err != nil {
return err
}
// Set the state from the snapshot, no lock required according to
// Hashicorp docs.
f.store.m = o
return nil
}
func (f *FSM) applySet(key string, value []byte) interface{} {
f.store.mu.Lock()
defer f.store.mu.Unlock()
f.store.m[key] = value
return nil
}
func (f *FSM) applyDelete(key string) interface{} {
f.store.mu.Lock()
defer f.store.mu.Unlock()
delete(f.store.m, key)
return nil
}
type fsmSnapshot struct {
store map[string][]byte
}
func (f *fsmSnapshot) Persist(sink raft.SnapshotSink) error {
err := func() error {
// Encode data.
b, err := json.Marshal(f.store)
if err != nil {
return err
}
// Write data to sink.
if _, err := sink.Write(b); err != nil {
return err
}
// Close the sink.
return sink.Close()
}()
if err != nil {
sink.Cancel()
}
return err
}
func (f *fsmSnapshot) Release() {
}
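(The `command` type decoded by `Apply` is not shown in this diff; from the fields it reads, `c.Op`, `c.Key` and `c.Value`, it plausibly looks like the sketch below, with `Store.Set`/`Store.Delete` marshalling one and submitting it through `Raft.Apply`. A hedged reconstruction, not the commit's actual store code.)

```go
package raft

import (
	"encoding/json"
	"time"

	"github.com/hashicorp/raft"
)

// Presumed shape of the command consumed by FSM.Apply, inferred from the
// fields it reads.
type command struct {
	Op    string `json:"op"`    // "set" or "delete"
	Key   string `json:"key"`   // e.g. "/domain/<uuid>"
	Value []byte `json:"value"` // JSON-encoded schema struct for "set"
}

// proposeSet shows how a write is typically submitted through Raft so that
// FSM.Apply executes it on every node; only the leader may call Apply.
func proposeSet(r *raft.Raft, key string, value []byte) error {
	b, err := json.Marshal(command{Op: "set", Key: key, Value: value})
	if err != nil {
		return err
	}
	return r.Apply(b, 10*time.Second).Error()
}
```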


@ -13,9 +13,10 @@ import (
"sync"
"time"
transport "deevirt.fr/compute/pkg/api/raft/transport"
transport "deevirt.fr/compute/pkg/raft/transport"
"github.com/hashicorp/raft"
raftboltdb "github.com/hashicorp/raft-boltdb/v2"
raftwal "github.com/hashicorp/raft-wal"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials"
@ -52,6 +53,8 @@ type Store struct {
Raft *raft.Raft // The consensus mechanism
lastIndex uint64
logger *log.Logger
}
@ -92,30 +95,47 @@ func New(conf *config.Config) *Store {
}
}
func (s *Store) Open() (*transport.Manager, error) {
func (s *Store) Open() (*Store, *transport.Manager, error) {
// Création du répertoire
baseDir := filepath.Join("/var/lib/deevirt/mgr/", s.conf.NodeID)
err := os.MkdirAll(baseDir, 0740)
if err != nil {
return nil, err
return nil, nil, err
}
walDir := filepath.Join(baseDir, "/wal")
err = os.MkdirAll(walDir, 0740)
if err != nil {
return nil, nil, err
}
c := raft.DefaultConfig()
c.SnapshotInterval = 60 * time.Second
c.SnapshotThreshold = 1000
c.SnapshotThreshold = 500
c.HeartbeatTimeout = 2 * time.Second
c.ElectionTimeout = 3 * time.Second
c.LocalID = raft.ServerID(s.conf.NodeID)
ldb, err := raftboltdb.NewBoltStore(filepath.Join(baseDir, "logs.dat"))
// Créer un LogStore avec Raft-WAL
logStore, err := raftwal.Open(walDir)
if err != nil {
return nil, fmt.Errorf(`boltdb.NewBoltStore(%q): %v`, filepath.Join(baseDir, "logs.dat"), err)
log.Fatalf("Erreur lors de la création du LogStore Raft-WAL : %v", err)
}
s.lastIndex, err = logStore.LastIndex()
if err != nil {
log.Fatalf("Erreur lors de la récupération de l'index de la dernière entrée: %v", err)
}
stableStore, err := raftboltdb.NewBoltStore(filepath.Join(baseDir, "logs.dat"))
if err != nil {
return nil, nil, fmt.Errorf(`boltdb.NewBoltStore(%q): %v`, filepath.Join(baseDir, "logs.dat"), err)
}
fss, err := raft.NewFileSnapshotStore(baseDir, 3, os.Stderr)
if err != nil {
return nil, fmt.Errorf(`raft.NewFileSnapshotStore(%q, ...): %v`, baseDir, err)
return nil, nil, fmt.Errorf(`raft.NewFileSnapshotStore(%q, ...): %v`, baseDir, err)
}
dialOption := []grpc.DialOption{}
@ -126,36 +146,27 @@ func (s *Store) Open() (*transport.Manager, error) {
tm := transport.New(raft.ServerAddress(s.conf.AddressPrivate), dialOption)
r, err := raft.NewRaft(c, (*Fsm)(s), ldb, ldb, fss, tm.Transport())
fsm, err := NewFSM(strings.Split(s.conf.EtcdURI, ","), s)
if err != nil {
return nil, fmt.Errorf("raft.NewRaft: %v", err)
log.Fatalf("%v", err)
}
r, err := raft.NewRaft(c, fsm, logStore, stableStore, fss, tm.Transport())
if err != nil {
return nil, nil, fmt.Errorf("raft.NewRaft: %v", err)
}
s.Raft = r
// Observer pour surveiller les changements d'état
stateCh := make(chan raft.Observation, 1) // Canal de type raft.Observation
r.RegisterObserver(raft.NewObserver(stateCh, true, nil))
node := &RaftNode{
Bootstrap: false,
Raft: r,
Store: s,
NodeID: s.conf.NodeID,
StateCh: stateCh,
}
go node.WatchStateChanges()
hasState, _ := checkIfStateExists(ldb)
hasState, _ := checkIfStateExists(logStore)
if strings.Split(s.conf.AddressPrivate, ":")[0] == s.conf.AddressPrivate && !hasState {
println("Démarrage du bootstrap ! ")
node.Bootstrap = true
//node.Bootstrap = true
// Récupération des Noeuds ID
etcd, err := etcd_client.New(s.conf.EtcdURI)
if err != nil {
return nil, err
return nil, nil, err
}
defer etcd.Close()
@ -178,11 +189,11 @@ func (s *Store) Open() (*transport.Manager, error) {
}
f := r.BootstrapCluster(cfg)
if err := f.Error(); err != nil {
return nil, fmt.Errorf("raft.Raft.BootstrapCluster: %v", err)
return nil, nil, fmt.Errorf("raft.Raft.BootstrapCluster: %v", err)
}
}
return tm, nil
return s, tm, nil
}
type LsOptions struct {
@ -192,6 +203,11 @@ type LsOptions struct {
// Retourne le contenu de la clé
func (s *Store) Ls(key string, options LsOptions) (map[string][]byte, error) {
barrier := s.Raft.Barrier(10 * time.Second)
if err := barrier.Error(); err != nil {
return nil, fmt.Errorf("barrier timeout: %v", err)
}
s.mu.Lock()
defer s.mu.Unlock()
@ -227,6 +243,11 @@ func (s *Store) Ls(key string, options LsOptions) (map[string][]byte, error) {
// Get returns the value for the given key.
func (s *Store) Get(key string) ([]byte, error) {
barrier := s.Raft.Barrier(10 * time.Second)
if err := barrier.Error(); err != nil {
return nil, fmt.Errorf("barrier timeout: %v", err)
}
s.mu.Lock()
defer s.mu.Unlock()
return s.m[key], nil
@ -272,7 +293,7 @@ func (s *Store) Delete(key string) error {
}
// Vérifie si l'état Raft existe déjà
func checkIfStateExists(logStore *raftboltdb.BoltStore) (bool, error) {
func checkIfStateExists(logStore *raftwal.WAL) (bool, error) {
// Vérifier les logs Raft
firstIndex, err := logStore.FirstIndex()
if err != nil {


@ -1,12 +0,0 @@
package deevirt_schema
// Schema dans le store
type DomainStore struct {
Type string `json:"type"`
NodeId string `json:"nodeID"`
Config []byte `json:"config"`
}
type DomainToNodeStore struct {
State int `json:"state"`
}


@ -1,20 +1,59 @@
package schema
import "google.golang.org/protobuf/types/known/timestamppb"
/*
/domain/{domain_id}
*/
// Schema dans le store
type Domain struct {
Metadata Metadata `xml:"metadata"`
Type string `json:"type"` // Qemu seulement pour le moment
Config string `json:"config"` // La configuration xml libvirt
}
type Metadata struct {
DeevirtInstance Instance `xml:"instance"`
/*
/domain/{domain_id}/node
*/
type DomainNode struct {
NodeId string `json:"nodeID"` // NodeID Owner
State int `json:"state"` // Son etat persistant
}
type Instance struct {
/*
/domain/{domain_id}/agent
*/
type DomainAgent struct {
// A définir
}
/*
/etc/libvirt/{type}/{node_id}/{domain_id}
*/
type DomainLock struct {
LifeCycle int `json:"lifecycle"` // Son etat réel
Expiry *timestamppb.Timestamp `json:"expiry"` // Date d'expiration du verouillage
}
// Other
type DomainToNode struct {
State int `json:"state"`
}
type DomainXML struct {
Metadata MetadataXML `xml:"metadata"`
}
type MetadataXML struct {
DeevirtInstance InstanceXML `xml:"instance"`
}
type InstanceXML struct {
DeevirtCompanyID string `xml:"company_id"`
DeevirtDatacenterID string `xml:"datacenter_id"`
}
// JSON SCHEMA for AMQP
type DomainStateJSON struct {
type DomainStateAMQP struct {
CompanyID string `json:"company_id"`
DatacenterID string `json:"datacenter_id"`
DomainID string `json:"domain_id"`

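(To make the key layout documented above concrete: a minimal sketch of writing one domain under `/domain/{domain_id}` and `/domain/{domain_id}/node`, mirroring the worker's init() code. The `kv` interface, the placeholders and the dropped errors are assumptions of this sketch.)

```go
package worker

import (
	"encoding/base64"
	"encoding/json"
	"fmt"

	"deevirt.fr/compute/pkg/schema"
)

// storeDomain is illustrative only: it writes one domain record and its node
// association under the documented key layout. kv stands in for the raft Store.
func storeDomain(kv interface{ Set(key string, value []byte) error }, uuid, nodeID, xmlConfig string, state int) error {
	dom, _ := json.Marshal(schema.Domain{
		Type:   "qemu",
		Config: base64.StdEncoding.EncodeToString([]byte(xmlConfig)),
	})
	if err := kv.Set(fmt.Sprintf("/domain/%s", uuid), dom); err != nil { // /domain/{domain_id}
		return err
	}
	node, _ := json.Marshal(schema.DomainNode{NodeId: nodeID, State: state})
	return kv.Set(fmt.Sprintf("/domain/%s/node", uuid), node) // /domain/{domain_id}/node
}
```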

@ -1,4 +1,4 @@
package deevirt_schema
package schema
import "google.golang.org/protobuf/types/known/timestamppb"

vendor/github.com/benbjohnson/immutable/LICENSE (new vendored file; MIT license, Copyright 2019 Ben Johnson; full text omitted)

vendor/github.com/benbjohnson/immutable/README.md (new vendored file)

@ -0,0 +1,301 @@
Immutable ![release](https://img.shields.io/github/release/benbjohnson/immutable.svg) ![test](https://github.com/benbjohnson/immutable/workflows/test/badge.svg) ![coverage](https://img.shields.io/codecov/c/github/benbjohnson/immutable/master.svg) ![license](https://img.shields.io/github/license/benbjohnson/immutable.svg)
=========
This repository contains *generic* immutable collection types for Go. It includes
`List`, `Map`, and `SortedMap` implementations. Immutable collections can
provide efficient, lock free sharing of data by requiring that edits to the
collections return new collections.
The collection types in this library are meant to mimic Go built-in collections
such as `slice` and `map`. The primary usage difference between Go collections
and `immutable` collections is that `immutable` collections always return a new
collection on mutation so you will need to save the new reference.
Immutable collections are not for every situation, however, as they can incur
additional CPU and memory overhead. Please evaluate the cost/benefit for your
particular project.
Special thanks to the [Immutable.js](https://immutable-js.github.io/immutable-js/)
team as the `List` & `Map` implementations are loose ports from that project.
## List
The `List` type represents a sorted, indexed collection of values and operates
similarly to a Go slice. It supports efficient append, prepend, update, and
slice operations.
### Adding list elements
Elements can be added to the end of the list with the `Append()` method or added
to the beginning of the list with the `Prepend()` method. Unlike Go slices,
prepending is as efficient as appending.
```go
// Create a list with 3 elements.
l := immutable.NewList[string]()
l = l.Append("foo")
l = l.Append("bar")
l = l.Prepend("baz")
fmt.Println(l.Len()) // 3
fmt.Println(l.Get(0)) // "baz"
fmt.Println(l.Get(1)) // "foo"
fmt.Println(l.Get(2)) // "bar"
```
Note that each change to the list results in a new list being created. These
lists are all snapshots at that point in time and cannot be changed so they
are safe to share between multiple goroutines.
### Updating list elements
You can also overwrite existing elements by using the `Set()` method. In the
following example, we'll update the third element in our list and return the
new list to a new variable. You can see that our old `l` variable retains a
snapshot of the original value.
```go
l := immutable.NewList[string]()
l = l.Append("foo")
l = l.Append("bar")
newList := l.Set(2, "baz")
fmt.Println(l.Get(1)) // "bar"
fmt.Println(newList.Get(1)) // "baz"
```
### Deriving sublists
You can create a sublist by using the `Slice()` method. This method works with
the same rules as subslicing a Go slice:
```go
l = l.Slice(0, 2)
fmt.Println(l.Len()) // 2
fmt.Println(l.Get(0)) // "baz"
fmt.Println(l.Get(1)) // "foo"
```
Please note that since `List` follows the same rules as slices, it will panic if
you try to `Get()`, `Set()`, or `Slice()` with indexes that are outside of
the range of the `List`.
### Iterating lists
Iterators provide a clean, simple way to iterate over the elements of the list
in order. This is more efficient than simply calling `Get()` for each index.
Below is an example of iterating over all elements of our list from above:
```go
itr := l.Iterator()
for !itr.Done() {
index, value, _ := itr.Next()
fmt.Printf("Index %d equals %v\n", index, value)
}
// Index 0 equals baz
// Index 1 equals foo
```
By default iterators start from index zero, however, the `Seek()` method can be
used to jump to a given index.
### Efficiently building lists
If you are building large lists, it is significantly more efficient to use the
`ListBuilder`. It uses nearly the same API as `List` except that it updates
a list in-place until you are ready to use it. This can improve bulk list
building by 10x or more.
```go
b := immutable.NewListBuilder[string]()
b.Append("foo")
b.Append("bar")
b.Set(2, "baz")
l := b.List()
fmt.Println(l.Get(0)) // "foo"
fmt.Println(l.Get(1)) // "baz"
```
Builders are invalid after the call to `List()`.
## Map
The `Map` represents an associative array that maps unique keys to values. It
is implemented to act similarly to the built-in Go `map` type. It is implemented
as a [Hash-Array Mapped Trie](https://lampwww.epfl.ch/papers/idealhashtrees.pdf).
Maps require a `Hasher` to hash keys and check for equality. There are built-in
hasher implementations for most primitive types such as `int`, `uint`, `string`,
and `[]byte` keys. You may pass in a `nil` hasher to `NewMap()` if you are using
one of these key types.
### Setting map key/value pairs
You can add a key/value pair to the map by using the `Set()` method. It will
add the key if it does not exist or it will overwrite the value for the key if
it does exist.
Values may be fetched for a key using the `Get()` method. This method returns
the value as well as a flag indicating if the key existed. The flag is useful
to check if a `nil` value was set for a key versus a key did not exist.
```go
m := immutable.NewMap[string,int](nil)
m = m.Set("jane", 100)
m = m.Set("susy", 200)
m = m.Set("jane", 300) // overwrite
fmt.Println(m.Len()) // 2
v, ok := m.Get("jane")
fmt.Println(v, ok) // 300 true
v, ok = m.Get("susy")
fmt.Println(v, ok) // 200, true
v, ok = m.Get("john")
fmt.Println(v, ok) // nil, false
```
### Removing map keys
Keys may be removed from the map by using the `Delete()` method. If the key does
not exist then the original map is returned instead of a new one.
```go
m := immutable.NewMap[string,int](nil)
m = m.Set("jane", 100)
m = m.Delete("jane")
fmt.Println(m.Len()) // 0
v, ok := m.Get("jane")
fmt.Println(v, ok) // nil false
```
### Iterating maps
Maps are unsorted, however, iterators can be used to loop over all key/value
pairs in the collection. Unlike Go maps, iterators are deterministic when
iterating over key/value pairs.
```go
m := immutable.NewMap[string,int](nil)
m = m.Set("jane", 100)
m = m.Set("susy", 200)
itr := m.Iterator()
for !itr.Done() {
k, v := itr.Next()
fmt.Println(k, v)
}
// susy 200
// jane 100
```
Note that you should not rely on two maps with the same key/value pairs to
iterate in the same order. Ordering can be insertion order dependent when two
keys generate the same hash.
### Efficiently building maps
If you are executing multiple mutations on a map, it can be much more efficient
to use the `MapBuilder`. It uses nearly the same API as `Map` except that it
updates a map in-place until you are ready to use it.
```go
b := immutable.NewMapBuilder[string,int](nil)
b.Set("foo", 100)
b.Set("bar", 200)
b.Set("foo", 300)
m := b.Map()
fmt.Println(m.Get("foo")) // "300"
fmt.Println(m.Get("bar")) // "200"
```
Builders are invalid after the call to `Map()`.
### Implementing a custom Hasher
If you need to use a key type besides `int`, `uint`, `string`, or `[]byte` then
you'll need to create a custom `Hasher` implementation and pass it to `NewMap()`
on creation.
Hashers are fairly simple. They only need to generate hashes for a given key
and check equality given two keys.
```go
type Hasher[K constraints.Ordered] interface {
Hash(key K) uint32
Equal(a, b K) bool
}
```
Please see the internal `intHasher`, `uintHasher`, `stringHasher`, and
`byteSliceHasher` for examples.
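For illustration, a sketch of a custom hasher for a string-based key type, assuming the `Hasher` interface shown above:

```go
// UserID is a custom key type; userIDHasher implements Hash and Equal for it.
type UserID string

type userIDHasher struct{}

func (userIDHasher) Hash(key UserID) uint32 {
	var h uint32 = 2166136261 // FNV-1a offset basis
	for i := 0; i < len(key); i++ {
		h ^= uint32(key[i])
		h *= 16777619 // FNV-1a prime
	}
	return h
}

func (userIDHasher) Equal(a, b UserID) bool { return a == b }

// Usage: m := immutable.NewMap[UserID, int](userIDHasher{})
```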
## Sorted Map
The `SortedMap` represents an associative array that maps unique keys to values.
Unlike the `Map`, however, keys can be iterated over in-order. It is implemented
as a B+tree.
Sorted maps require a `Comparer` to sort keys and check for equality. There are
built-in comparer implementations for `int`, `uint`, `string`, and `[]byte` keys.
You may pass a `nil` comparer to `NewSortedMap()` if you are using one of these
key types.
The API is identical to the `Map` implementation. The sorted map also has a
companion `SortedMapBuilder` for more efficiently building maps.
### Implementing a custom Comparer
If you need to use a key type besides `int`, `uint`, `string`, or `[]byte`
then you'll need to create a custom `Comparer` implementation and pass it to
`NewSortedMap()` on creation.
Comparers have only one method, `Compare()`. It works the same as the
`strings.Compare()` function. It returns `-1` if `a` is less than `b`, returns
`1` if a is greater than `b`, and returns `0` if `a` is equal to `b`.
```go
type Comparer[K constraints.Ordered] interface {
Compare(a, b K) int
}
```
Please see the internal `intComparer`, `uintComparer`, `stringComparer`, and
`byteSliceComparer` for examples.
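Likewise, a sketch of a custom comparer for a string-based key type that sorts case-insensitively (using the standard `strings` package), assuming the `Comparer` interface shown above:

```go
// Tag is a custom key type ordered case-insensitively by tagComparer.
type Tag string

type tagComparer struct{}

func (tagComparer) Compare(a, b Tag) int {
	return strings.Compare(strings.ToLower(string(a)), strings.ToLower(string(b)))
}

// Usage: m := immutable.NewSortedMap[Tag, int](tagComparer{})
```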
## Contributing
The goal of `immutable` is to provide stable, reasonably performant, immutable
collections library for Go that has a simple, idiomatic API. As such, additional
features and minor performance improvements will generally not be accepted. If
you have a suggestion for a clearer API or substantial performance improvement,
_please_ open an issue first to discuss. All pull requests without a related
issue will be closed immediately.
Please submit issues relating to bugs & documentation improvements.

vendor/github.com/benbjohnson/immutable/immutable.go (new vendored file; diff suppressed because it is too large)

vendor/github.com/coreos/etcd/LICENSE (new vendored file; standard Apache License 2.0 text omitted)

27
vendor/github.com/coreos/etcd/pkg/fileutil/dir_unix.go generated vendored Normal file
View File

@ -0,0 +1,27 @@
// Copyright 2016 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build !windows
package fileutil
import "os"
const (
// PrivateDirMode grants owner to make/remove files inside the directory.
PrivateDirMode = 0700
)
// OpenDir opens a directory for syncing.
func OpenDir(path string) (*os.File, error) { return os.Open(path) }

View File

@ -0,0 +1,51 @@
// Copyright 2016 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build windows
package fileutil
import (
"os"
"syscall"
)
const (
// PrivateDirMode grants owner to make/remove files inside the directory.
PrivateDirMode = 0777
)
// OpenDir opens a directory in windows with write access for syncing.
func OpenDir(path string) (*os.File, error) {
fd, err := openDir(path)
if err != nil {
return nil, err
}
return os.NewFile(uintptr(fd), path), nil
}
func openDir(path string) (fd syscall.Handle, err error) {
if len(path) == 0 {
return syscall.InvalidHandle, syscall.ERROR_FILE_NOT_FOUND
}
pathp, err := syscall.UTF16PtrFromString(path)
if err != nil {
return syscall.InvalidHandle, err
}
access := uint32(syscall.GENERIC_READ | syscall.GENERIC_WRITE)
sharemode := uint32(syscall.FILE_SHARE_READ | syscall.FILE_SHARE_WRITE)
createmode := uint32(syscall.OPEN_EXISTING)
fl := uint32(syscall.FILE_FLAG_BACKUP_SEMANTICS)
return syscall.CreateFile(pathp, access, sharemode, nil, createmode, fl, 0)
}

147
vendor/github.com/coreos/etcd/pkg/fileutil/fileutil.go generated vendored Normal file
View File

@ -0,0 +1,147 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package fileutil implements utility functions related to files and paths.
package fileutil
import (
"fmt"
"io"
"io/ioutil"
"os"
"path/filepath"
"sort"
"github.com/coreos/pkg/capnslog"
)
const (
// PrivateFileMode grants owner to read/write a file.
PrivateFileMode = 0600
)
var (
plog = capnslog.NewPackageLogger("github.com/coreos/etcd", "pkg/fileutil")
)
// IsDirWriteable checks if dir is writable by writing and removing a file
// to dir. It returns nil if dir is writable.
func IsDirWriteable(dir string) error {
f := filepath.Join(dir, ".touch")
if err := ioutil.WriteFile(f, []byte(""), PrivateFileMode); err != nil {
return err
}
return os.Remove(f)
}
// ReadDir returns the filenames in the given directory in sorted order.
func ReadDir(dirpath string) ([]string, error) {
dir, err := os.Open(dirpath)
if err != nil {
return nil, err
}
defer dir.Close()
names, err := dir.Readdirnames(-1)
if err != nil {
return nil, err
}
sort.Strings(names)
return names, nil
}
// TouchDirAll is similar to os.MkdirAll. It creates directories with 0700 permission if any directory
// does not exist. TouchDirAll also ensures the given directory is writable.
func TouchDirAll(dir string) error {
// If path is already a directory, MkdirAll does nothing and returns nil, so,
// first check if dir exist with an expected permission mode.
if Exist(dir) {
err := CheckDirPermission(dir, PrivateDirMode)
if err != nil {
plog.Warningf("check file permission: %v", err)
}
} else {
err := os.MkdirAll(dir, PrivateDirMode)
if err != nil {
// if mkdirAll("a/text") and "text" is not
// a directory, this will return syscall.ENOTDIR
return err
}
}
return IsDirWriteable(dir)
}
// CreateDirAll is similar to TouchDirAll but returns error
// if the deepest directory was not empty.
func CreateDirAll(dir string) error {
err := TouchDirAll(dir)
if err == nil {
var ns []string
ns, err = ReadDir(dir)
if err != nil {
return err
}
if len(ns) != 0 {
err = fmt.Errorf("expected %q to be empty, got %q", dir, ns)
}
}
return err
}
func Exist(name string) bool {
_, err := os.Stat(name)
return err == nil
}
// ZeroToEnd zeros a file starting from SEEK_CUR to its SEEK_END. May temporarily
// shorten the length of the file.
func ZeroToEnd(f *os.File) error {
// TODO: support FALLOC_FL_ZERO_RANGE
off, err := f.Seek(0, io.SeekCurrent)
if err != nil {
return err
}
lenf, lerr := f.Seek(0, io.SeekEnd)
if lerr != nil {
return lerr
}
if err = f.Truncate(off); err != nil {
return err
}
// make sure blocks remain allocated
if err = Preallocate(f, lenf, true); err != nil {
return err
}
_, err = f.Seek(off, io.SeekStart)
return err
}
// CheckDirPermission checks permission on an existing dir.
// Returns error if dir is empty or exist with a different permission than specified.
func CheckDirPermission(dir string, perm os.FileMode) error {
if !Exist(dir) {
return fmt.Errorf("directory %q empty, cannot check permission.", dir)
}
//check the existing permission on the directory
dirInfo, err := os.Stat(dir)
if err != nil {
return err
}
dirMode := dirInfo.Mode().Perm()
if dirMode != perm {
err = fmt.Errorf("directory %q exist, but the permission is %q. The recommended permission is %q to prevent possible unprivileged access to the data.", dir, dirInfo.Mode(), os.FileMode(PrivateDirMode))
return err
}
return nil
}
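The helpers above are typically combined when preparing a data directory: TouchDirAll creates it with PrivateDirMode and checks writability, while CreateDirAll additionally insists the deepest directory is empty. A minimal usage sketch, assuming a hypothetical deevirt data path:

package main

import (
	"log"

	"github.com/coreos/etcd/pkg/fileutil"
)

func main() {
	dir := "/var/lib/deevirt/wal" // hypothetical path, for illustration only

	// Create the directory with 0700 if missing, then verify it is writable.
	if err := fileutil.TouchDirAll(dir); err != nil {
		log.Fatalf("cannot prepare %s: %v", dir, err)
	}

	// CreateDirAll additionally fails if the deepest directory is not empty.
	if err := fileutil.CreateDirAll(dir + ".tmp"); err != nil {
		log.Fatalf("cannot create empty dir: %v", err)
	}
}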

26
vendor/github.com/coreos/etcd/pkg/fileutil/lock.go generated vendored Normal file
View File

@ -0,0 +1,26 @@
// Copyright 2016 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fileutil
import (
"errors"
"os"
)
var (
ErrLocked = errors.New("fileutil: file already locked")
)
type LockedFile struct{ *os.File }

View File

@ -0,0 +1,49 @@
// Copyright 2016 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build !windows,!plan9,!solaris
package fileutil
import (
"os"
"syscall"
)
func flockTryLockFile(path string, flag int, perm os.FileMode) (*LockedFile, error) {
f, err := os.OpenFile(path, flag, perm)
if err != nil {
return nil, err
}
if err = syscall.Flock(int(f.Fd()), syscall.LOCK_EX|syscall.LOCK_NB); err != nil {
f.Close()
if err == syscall.EWOULDBLOCK {
err = ErrLocked
}
return nil, err
}
return &LockedFile{f}, nil
}
func flockLockFile(path string, flag int, perm os.FileMode) (*LockedFile, error) {
f, err := os.OpenFile(path, flag, perm)
if err != nil {
return nil, err
}
if err = syscall.Flock(int(f.Fd()), syscall.LOCK_EX); err != nil {
f.Close()
return nil, err
}
return &LockedFile{f}, err
}

View File

@ -0,0 +1,97 @@
// Copyright 2016 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build linux
package fileutil
import (
"io"
"os"
"syscall"
)
// This used to call syscall.Flock() but that call fails with EBADF on NFS.
// An alternative is lockf() which works on NFS but that call lets a process lock
// the same file twice. Instead, use Linux's non-standard open file descriptor
// locks which will block if the process already holds the file lock.
//
// constants from /usr/include/bits/fcntl-linux.h
const (
F_OFD_GETLK = 36
F_OFD_SETLK = 37
F_OFD_SETLKW = 38
)
var (
wrlck = syscall.Flock_t{
Type: syscall.F_WRLCK,
Whence: int16(io.SeekStart),
Start: 0,
Len: 0,
}
linuxTryLockFile = flockTryLockFile
linuxLockFile = flockLockFile
)
func init() {
// use open file descriptor locks if the system supports it
getlk := syscall.Flock_t{Type: syscall.F_RDLCK}
if err := syscall.FcntlFlock(0, F_OFD_GETLK, &getlk); err == nil {
linuxTryLockFile = ofdTryLockFile
linuxLockFile = ofdLockFile
}
}
func TryLockFile(path string, flag int, perm os.FileMode) (*LockedFile, error) {
return linuxTryLockFile(path, flag, perm)
}
func ofdTryLockFile(path string, flag int, perm os.FileMode) (*LockedFile, error) {
f, err := os.OpenFile(path, flag, perm)
if err != nil {
return nil, err
}
flock := wrlck
if err = syscall.FcntlFlock(f.Fd(), F_OFD_SETLK, &flock); err != nil {
f.Close()
if err == syscall.EWOULDBLOCK {
err = ErrLocked
}
return nil, err
}
return &LockedFile{f}, nil
}
func LockFile(path string, flag int, perm os.FileMode) (*LockedFile, error) {
return linuxLockFile(path, flag, perm)
}
func ofdLockFile(path string, flag int, perm os.FileMode) (*LockedFile, error) {
f, err := os.OpenFile(path, flag, perm)
if err != nil {
return nil, err
}
flock := wrlck
err = syscall.FcntlFlock(f.Fd(), F_OFD_SETLKW, &flock)
if err != nil {
f.Close()
return nil, err
}
return &LockedFile{f}, err
}

View File

@ -0,0 +1,45 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fileutil
import (
"os"
"syscall"
"time"
)
func TryLockFile(path string, flag int, perm os.FileMode) (*LockedFile, error) {
if err := os.Chmod(path, syscall.DMEXCL|PrivateFileMode); err != nil {
return nil, err
}
f, err := os.OpenFile(path, flag, perm)
if err != nil {
return nil, ErrLocked
}
return &LockedFile{f}, nil
}
func LockFile(path string, flag int, perm os.FileMode) (*LockedFile, error) {
if err := os.Chmod(path, syscall.DMEXCL|PrivateFileMode); err != nil {
return nil, err
}
for {
f, err := os.OpenFile(path, flag, perm)
if err == nil {
return &LockedFile{f}, nil
}
time.Sleep(10 * time.Millisecond)
}
}

View File

@ -0,0 +1,62 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build solaris
package fileutil
import (
"os"
"syscall"
)
func TryLockFile(path string, flag int, perm os.FileMode) (*LockedFile, error) {
var lock syscall.Flock_t
lock.Start = 0
lock.Len = 0
lock.Pid = 0
lock.Type = syscall.F_WRLCK
lock.Whence = 0
lock.Pid = 0
f, err := os.OpenFile(path, flag, perm)
if err != nil {
return nil, err
}
if err := syscall.FcntlFlock(f.Fd(), syscall.F_SETLK, &lock); err != nil {
f.Close()
if err == syscall.EAGAIN {
err = ErrLocked
}
return nil, err
}
return &LockedFile{f}, nil
}
func LockFile(path string, flag int, perm os.FileMode) (*LockedFile, error) {
var lock syscall.Flock_t
lock.Start = 0
lock.Len = 0
lock.Pid = 0
lock.Type = syscall.F_WRLCK
lock.Whence = 0
f, err := os.OpenFile(path, flag, perm)
if err != nil {
return nil, err
}
if err = syscall.FcntlFlock(f.Fd(), syscall.F_SETLKW, &lock); err != nil {
f.Close()
return nil, err
}
return &LockedFile{f}, nil
}

View File

@ -0,0 +1,29 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build !windows,!plan9,!solaris,!linux
package fileutil
import (
"os"
)
func TryLockFile(path string, flag int, perm os.FileMode) (*LockedFile, error) {
return flockTryLockFile(path, flag, perm)
}
func LockFile(path string, flag int, perm os.FileMode) (*LockedFile, error) {
return flockLockFile(path, flag, perm)
}

View File

@ -0,0 +1,125 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build windows
package fileutil
import (
"errors"
"fmt"
"os"
"syscall"
"unsafe"
)
var (
modkernel32 = syscall.NewLazyDLL("kernel32.dll")
procLockFileEx = modkernel32.NewProc("LockFileEx")
errLocked = errors.New("The process cannot access the file because another process has locked a portion of the file.")
)
const (
// https://msdn.microsoft.com/en-us/library/windows/desktop/aa365203(v=vs.85).aspx
LOCKFILE_EXCLUSIVE_LOCK = 2
LOCKFILE_FAIL_IMMEDIATELY = 1
// see https://msdn.microsoft.com/en-us/library/windows/desktop/ms681382(v=vs.85).aspx
errLockViolation syscall.Errno = 0x21
)
func TryLockFile(path string, flag int, perm os.FileMode) (*LockedFile, error) {
f, err := open(path, flag, perm)
if err != nil {
return nil, err
}
if err := lockFile(syscall.Handle(f.Fd()), LOCKFILE_FAIL_IMMEDIATELY); err != nil {
f.Close()
return nil, err
}
return &LockedFile{f}, nil
}
func LockFile(path string, flag int, perm os.FileMode) (*LockedFile, error) {
f, err := open(path, flag, perm)
if err != nil {
return nil, err
}
if err := lockFile(syscall.Handle(f.Fd()), 0); err != nil {
f.Close()
return nil, err
}
return &LockedFile{f}, nil
}
func open(path string, flag int, perm os.FileMode) (*os.File, error) {
if path == "" {
return nil, fmt.Errorf("cannot open empty filename")
}
var access uint32
switch flag {
case syscall.O_RDONLY:
access = syscall.GENERIC_READ
case syscall.O_WRONLY:
access = syscall.GENERIC_WRITE
case syscall.O_RDWR:
access = syscall.GENERIC_READ | syscall.GENERIC_WRITE
case syscall.O_WRONLY | syscall.O_CREAT:
access = syscall.GENERIC_ALL
default:
panic(fmt.Errorf("flag %v is not supported", flag))
}
fd, err := syscall.CreateFile(&(syscall.StringToUTF16(path)[0]),
access,
syscall.FILE_SHARE_READ|syscall.FILE_SHARE_WRITE|syscall.FILE_SHARE_DELETE,
nil,
syscall.OPEN_ALWAYS,
syscall.FILE_ATTRIBUTE_NORMAL,
0)
if err != nil {
return nil, err
}
return os.NewFile(uintptr(fd), path), nil
}
func lockFile(fd syscall.Handle, flags uint32) error {
var flag uint32 = LOCKFILE_EXCLUSIVE_LOCK
flag |= flags
if fd == syscall.InvalidHandle {
return nil
}
err := lockFileEx(fd, flag, 1, 0, &syscall.Overlapped{})
if err == nil {
return nil
} else if err.Error() == errLocked.Error() {
return ErrLocked
} else if err != errLockViolation {
return err
}
return nil
}
func lockFileEx(h syscall.Handle, flags, locklow, lockhigh uint32, ol *syscall.Overlapped) (err error) {
var reserved uint32 = 0
r1, _, e1 := syscall.Syscall6(procLockFileEx.Addr(), 6, uintptr(h), uintptr(flags), uintptr(reserved), uintptr(locklow), uintptr(lockhigh), uintptr(unsafe.Pointer(ol)))
if r1 == 0 {
if e1 != 0 {
err = error(e1)
} else {
err = syscall.EINVAL
}
}
return err
}
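All of the platform-specific files above expose the same two entry points: TryLockFile (non-blocking, returns ErrLocked when another process already holds the lock) and LockFile (blocks until the lock is acquired). A minimal sketch of how a caller might use them, with a hypothetical lock path:

package main

import (
	"log"
	"os"

	"github.com/coreos/etcd/pkg/fileutil"
)

func main() {
	path := "/var/lock/deevirt.lock" // hypothetical path, for illustration only

	// Non-blocking attempt first; ErrLocked means another process owns the lock.
	l, err := fileutil.TryLockFile(path, os.O_WRONLY|os.O_CREATE, fileutil.PrivateFileMode)
	if err == fileutil.ErrLocked {
		log.Println("lock is held elsewhere, waiting for it")
		// Blocking variant: returns once the lock has been acquired.
		l, err = fileutil.LockFile(path, os.O_WRONLY|os.O_CREATE, fileutil.PrivateFileMode)
	}
	if err != nil {
		log.Fatal(err)
	}
	defer l.Close() // closing the file releases the lock

	// ... exclusive section ...
}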

View File

@ -0,0 +1,54 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fileutil
import (
"io"
"os"
)
// Preallocate tries to allocate the space for given
// file. This operation is only supported on linux by a
// few filesystems (btrfs, ext4, etc.).
// If the operation is unsupported, no error will be returned.
// Otherwise, the error encountered will be returned.
func Preallocate(f *os.File, sizeInBytes int64, extendFile bool) error {
if sizeInBytes == 0 {
// fallocate will return EINVAL if length is 0; skip
return nil
}
if extendFile {
return preallocExtend(f, sizeInBytes)
}
return preallocFixed(f, sizeInBytes)
}
func preallocExtendTrunc(f *os.File, sizeInBytes int64) error {
curOff, err := f.Seek(0, io.SeekCurrent)
if err != nil {
return err
}
size, err := f.Seek(sizeInBytes, io.SeekEnd)
if err != nil {
return err
}
if _, err = f.Seek(curOff, io.SeekStart); err != nil {
return err
}
if sizeInBytes > size {
return nil
}
return f.Truncate(sizeInBytes)
}

View File

@ -0,0 +1,65 @@
// Copyright 2016 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build darwin
package fileutil
import (
"os"
"syscall"
"unsafe"
)
func preallocExtend(f *os.File, sizeInBytes int64) error {
if err := preallocFixed(f, sizeInBytes); err != nil {
return err
}
return preallocExtendTrunc(f, sizeInBytes)
}
func preallocFixed(f *os.File, sizeInBytes int64) error {
// allocate all requested space or no space at all
// TODO: allocate contiguous space on disk with F_ALLOCATECONTIG flag
fstore := &syscall.Fstore_t{
Flags: syscall.F_ALLOCATEALL,
Posmode: syscall.F_PEOFPOSMODE,
Length: sizeInBytes}
p := unsafe.Pointer(fstore)
_, _, errno := syscall.Syscall(syscall.SYS_FCNTL, f.Fd(), uintptr(syscall.F_PREALLOCATE), uintptr(p))
if errno == 0 || errno == syscall.ENOTSUP {
return nil
}
// wrong argument to fallocate syscall
if errno == syscall.EINVAL {
// filesystem "st_blocks" are allocated in the units of
// "Allocation Block Size" (run "diskutil info /" command)
var stat syscall.Stat_t
syscall.Fstat(int(f.Fd()), &stat)
// syscall.Statfs_t.Bsize is "optimal transfer block size"
// and contains matching 4096 value when latest OS X kernel
// supports 4,096 KB filesystem block size
var statfs syscall.Statfs_t
syscall.Fstatfs(int(f.Fd()), &statfs)
blockSize := int64(statfs.Bsize)
if stat.Blocks*blockSize >= sizeInBytes {
// enough blocks are already allocated
return nil
}
}
return errno
}

View File

@ -0,0 +1,49 @@
// Copyright 2016 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build linux
package fileutil
import (
"os"
"syscall"
)
func preallocExtend(f *os.File, sizeInBytes int64) error {
// use mode = 0 to change size
err := syscall.Fallocate(int(f.Fd()), 0, 0, sizeInBytes)
if err != nil {
errno, ok := err.(syscall.Errno)
// not supported; fallback
// fallocate EINTRs frequently in some environments; fallback
if ok && (errno == syscall.ENOTSUP || errno == syscall.EINTR) {
return preallocExtendTrunc(f, sizeInBytes)
}
}
return err
}
func preallocFixed(f *os.File, sizeInBytes int64) error {
// use mode = 1 to keep size; see FALLOC_FL_KEEP_SIZE
err := syscall.Fallocate(int(f.Fd()), 1, 0, sizeInBytes)
if err != nil {
errno, ok := err.(syscall.Errno)
// treat not supported as nil error
if ok && errno == syscall.ENOTSUP {
return nil
}
}
return err
}

View File

@ -0,0 +1,25 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build !linux,!darwin
package fileutil
import "os"
func preallocExtend(f *os.File, sizeInBytes int64) error {
return preallocExtendTrunc(f, sizeInBytes)
}
func preallocFixed(f *os.File, sizeInBytes int64) error { return nil }
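Preallocate is the cross-platform front door to these per-OS implementations: with extendFile=true it grows the file (fallocate / F_PREALLOCATE plus a truncate fallback), with extendFile=false it only reserves blocks without changing the visible size. A short sketch, assuming a hypothetical WAL segment path:

package main

import (
	"log"
	"os"

	"github.com/coreos/etcd/pkg/fileutil"
)

func main() {
	// Hypothetical WAL segment, for illustration only.
	f, err := os.OpenFile("/var/lib/deevirt/wal/0.wal", os.O_RDWR|os.O_CREATE, fileutil.PrivateFileMode)
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	// Reserve 64 MiB up front and extend the file to that size.
	if err := fileutil.Preallocate(f, 64*1024*1024, true); err != nil {
		log.Fatalf("preallocate failed: %v", err)
	}
}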

88
vendor/github.com/coreos/etcd/pkg/fileutil/purge.go generated vendored Normal file
View File

@ -0,0 +1,88 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fileutil
import (
"os"
"path/filepath"
"sort"
"strings"
"time"
)
func PurgeFile(dirname string, suffix string, max uint, interval time.Duration, stop <-chan struct{}) <-chan error {
return purgeFile(dirname, suffix, max, interval, stop, nil, nil)
}
func PurgeFileWithDoneNotify(dirname string, suffix string, max uint, interval time.Duration, stop <-chan struct{}) (<-chan struct{}, <-chan error) {
doneC := make(chan struct{})
errC := purgeFile(dirname, suffix, max, interval, stop, nil, doneC)
return doneC, errC
}
// purgeFile is the internal implementation for PurgeFile which can post purged files to purgec if non-nil.
// if donec is non-nil, the function closes it to notify its exit.
func purgeFile(dirname string, suffix string, max uint, interval time.Duration, stop <-chan struct{}, purgec chan<- string, donec chan<- struct{}) <-chan error {
errC := make(chan error, 1)
go func() {
if donec != nil {
defer close(donec)
}
for {
fnames, err := ReadDir(dirname)
if err != nil {
errC <- err
return
}
newfnames := make([]string, 0)
for _, fname := range fnames {
if strings.HasSuffix(fname, suffix) {
newfnames = append(newfnames, fname)
}
}
sort.Strings(newfnames)
fnames = newfnames
for len(newfnames) > int(max) {
f := filepath.Join(dirname, newfnames[0])
l, err := TryLockFile(f, os.O_WRONLY, PrivateFileMode)
if err != nil {
break
}
if err = os.Remove(f); err != nil {
errC <- err
return
}
if err = l.Close(); err != nil {
plog.Errorf("error unlocking %s when purging file (%v)", l.Name(), err)
errC <- err
return
}
plog.Infof("purged file %s successfully", f)
newfnames = newfnames[1:]
}
if purgec != nil {
for i := 0; i < len(fnames)-len(newfnames); i++ {
purgec <- fnames[i]
}
}
select {
case <-time.After(interval):
case <-stop:
return
}
}
}()
return errC
}
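PurgeFile runs as a background loop: every interval it lists dirname, keeps the newest max files with the given suffix, and deletes older ones it can lock; the returned channel carries the first fatal error. A usage sketch with hypothetical paths and limits:

package main

import (
	"log"
	"time"

	"github.com/coreos/etcd/pkg/fileutil"
)

func main() {
	stop := make(chan struct{})

	// Keep at most 5 "*.snap" files in the directory, checking every 30 seconds.
	errC := fileutil.PurgeFile("/var/lib/deevirt/snap", "snap", 5, 30*time.Second, stop)

	select {
	case err := <-errC:
		log.Fatalf("purge loop stopped: %v", err)
	case <-time.After(time.Minute):
		close(stop) // shut the purge loop down cleanly
	}
}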

29
vendor/github.com/coreos/etcd/pkg/fileutil/sync.go generated vendored Normal file
View File

@ -0,0 +1,29 @@
// Copyright 2016 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build !linux,!darwin
package fileutil
import "os"
// Fsync is a wrapper around file.Sync(). Special handling is needed on darwin platform.
func Fsync(f *os.File) error {
return f.Sync()
}
// Fdatasync is a wrapper around file.Sync(). Special handling is needed on linux platform.
func Fdatasync(f *os.File) error {
return f.Sync()
}

View File

@ -0,0 +1,40 @@
// Copyright 2016 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build darwin
package fileutil
import (
"os"
"syscall"
)
// Fsync on HFS/OSX flushes the data onto the physical drive, but the drive
// may not write it to the persistent media for quite some time and it may be
// written out of order. Using F_FULLFSYNC ensures that the
// physical drive's buffer will also get flushed to the media.
func Fsync(f *os.File) error {
_, _, errno := syscall.Syscall(syscall.SYS_FCNTL, f.Fd(), uintptr(syscall.F_FULLFSYNC), uintptr(0))
if errno == 0 {
return nil
}
return errno
}
// Fdatasync on darwin platform invokes fcntl(F_FULLFSYNC) for actual persistence
// on physical drive media.
func Fdatasync(f *os.File) error {
return Fsync(f)
}

View File

@ -0,0 +1,34 @@
// Copyright 2016 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build linux
package fileutil
import (
"os"
"syscall"
)
// Fsync is a wrapper around file.Sync(). Special handling is needed on darwin platform.
func Fsync(f *os.File) error {
return f.Sync()
}
// Fdatasync is similar to fsync(), but does not flush modified metadata
// unless that metadata is needed in order to allow a subsequent data retrieval
// to be correctly handled.
func Fdatasync(f *os.File) error {
return syscall.Fdatasync(int(f.Fd()))
}
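Taken together, the three sync variants give callers a single pair of names, Fsync and Fdatasync, whose cost differs per platform (plain Sync by default, F_FULLFSYNC on darwin, fdatasync on Linux). A small sketch of syncing after an append, with a hypothetical file path:

package main

import (
	"log"
	"os"

	"github.com/coreos/etcd/pkg/fileutil"
)

func main() {
	// Hypothetical append-only log file, for illustration only.
	f, err := os.OpenFile("/var/lib/deevirt/wal/0.wal", os.O_WRONLY|os.O_APPEND|os.O_CREATE, fileutil.PrivateFileMode)
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	if _, err := f.Write([]byte("record\n")); err != nil {
		log.Fatal(err)
	}

	// Fdatasync skips metadata-only flushes where the platform allows it.
	if err := fileutil.Fdatasync(f); err != nil {
		log.Fatal(err)
	}
}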

191
vendor/github.com/coreos/go-systemd/LICENSE generated vendored Normal file
View File

@ -0,0 +1,191 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction, and
distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by the copyright
owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all other entities
that control, are controlled by, or are under common control with that entity.
For the purposes of this definition, "control" means (i) the power, direct or
indirect, to cause the direction or management of such entity, whether by
contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity exercising
permissions granted by this License.
"Source" form shall mean the preferred form for making modifications, including
but not limited to software source code, documentation source, and configuration
files.
"Object" form shall mean any form resulting from mechanical transformation or
translation of a Source form, including but not limited to compiled object code,
generated documentation, and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or Object form, made
available under the License, as indicated by a copyright notice that is included
in or attached to the work (an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object form, that
is based on (or derived from) the Work and for which the editorial revisions,
annotations, elaborations, or other modifications represent, as a whole, an
original work of authorship. For the purposes of this License, Derivative Works
shall not include works that remain separable from, or merely link (or bind by
name) to the interfaces of, the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including the original version
of the Work and any modifications or additions to that Work or Derivative Works
thereof, that is intentionally submitted to Licensor for inclusion in the Work
by the copyright owner or by an individual or Legal Entity authorized to submit
on behalf of the copyright owner. For the purposes of this definition,
"submitted" means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems, and
issue tracking systems that are managed by, or on behalf of, the Licensor for
the purpose of discussing and improving the Work, but excluding communication
that is conspicuously marked or otherwise designated in writing by the copyright
owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity on behalf
of whom a Contribution has been received by Licensor and subsequently
incorporated within the Work.
2. Grant of Copyright License.
Subject to the terms and conditions of this License, each Contributor hereby
grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
irrevocable copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the Work and such
Derivative Works in Source or Object form.
3. Grant of Patent License.
Subject to the terms and conditions of this License, each Contributor hereby
grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
irrevocable (except as stated in this section) patent license to make, have
made, use, offer to sell, sell, import, and otherwise transfer the Work, where
such license applies only to those patent claims licensable by such Contributor
that are necessarily infringed by their Contribution(s) alone or by combination
of their Contribution(s) with the Work to which such Contribution(s) was
submitted. If You institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work or a
Contribution incorporated within the Work constitutes direct or contributory
patent infringement, then any patent licenses granted to You under this License
for that Work shall terminate as of the date such litigation is filed.
4. Redistribution.
You may reproduce and distribute copies of the Work or Derivative Works thereof
in any medium, with or without modifications, and in Source or Object form,
provided that You meet the following conditions:
You must give any other recipients of the Work or Derivative Works a copy of
this License; and
You must cause any modified files to carry prominent notices stating that You
changed the files; and
You must retain, in the Source form of any Derivative Works that You distribute,
all copyright, patent, trademark, and attribution notices from the Source form
of the Work, excluding those notices that do not pertain to any part of the
Derivative Works; and
If the Work includes a "NOTICE" text file as part of its distribution, then any
Derivative Works that You distribute must include a readable copy of the
attribution notices contained within such NOTICE file, excluding those notices
that do not pertain to any part of the Derivative Works, in at least one of the
following places: within a NOTICE text file distributed as part of the
Derivative Works; within the Source form or documentation, if provided along
with the Derivative Works; or, within a display generated by the Derivative
Works, if and wherever such third-party notices normally appear. The contents of
the NOTICE file are for informational purposes only and do not modify the
License. You may add Your own attribution notices within Derivative Works that
You distribute, alongside or as an addendum to the NOTICE text from the Work,
provided that such additional attribution notices cannot be construed as
modifying the License.
You may add Your own copyright statement to Your modifications and may provide
additional or different license terms and conditions for use, reproduction, or
distribution of Your modifications, or for any such Derivative Works as a whole,
provided Your use, reproduction, and distribution of the Work otherwise complies
with the conditions stated in this License.
5. Submission of Contributions.
Unless You explicitly state otherwise, any Contribution intentionally submitted
for inclusion in the Work by You to the Licensor shall be under the terms and
conditions of this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify the terms of
any separate license agreement you may have executed with Licensor regarding
such Contributions.
6. Trademarks.
This License does not grant permission to use the trade names, trademarks,
service marks, or product names of the Licensor, except as required for
reasonable and customary use in describing the origin of the Work and
reproducing the content of the NOTICE file.
7. Disclaimer of Warranty.
Unless required by applicable law or agreed to in writing, Licensor provides the
Work (and each Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied,
including, without limitation, any warranties or conditions of TITLE,
NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are
solely responsible for determining the appropriateness of using or
redistributing the Work and assume any risks associated with Your exercise of
permissions under this License.
8. Limitation of Liability.
In no event and under no legal theory, whether in tort (including negligence),
contract, or otherwise, unless required by applicable law (such as deliberate
and grossly negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special, incidental,
or consequential damages of any character arising as a result of this License or
out of the use or inability to use the Work (including but not limited to
damages for loss of goodwill, work stoppage, computer failure or malfunction, or
any and all other commercial damages or losses), even if such Contributor has
been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability.
While redistributing the Work or Derivative Works thereof, You may choose to
offer, and charge a fee for, acceptance of support, warranty, indemnity, or
other liability obligations and/or rights consistent with this License. However,
in accepting such obligations, You may act only on Your own behalf and on Your
sole responsibility, not on behalf of any other Contributor, and only if You
agree to indemnify, defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason of your
accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work
To apply the Apache License to your work, attach the following boilerplate
notice, with the fields enclosed by brackets "[]" replaced with your own
identifying information. (Don't include the brackets!) The text should be
enclosed in the appropriate comment syntax for the file format. We also
recommend that a file or class name and description of purpose be included on
the same "printed page" as the copyright notice for easier identification within
third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

225
vendor/github.com/coreos/go-systemd/journal/journal.go generated vendored Normal file
View File

@ -0,0 +1,225 @@
// Copyright 2015 CoreOS, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package journal provides write bindings to the local systemd journal.
// It is implemented in pure Go and connects to the journal directly over its
// unix socket.
//
// To read from the journal, see the "sdjournal" package, which wraps the
// sd-journal C API.
//
// http://www.freedesktop.org/software/systemd/man/systemd-journald.service.html
package journal
import (
"bytes"
"encoding/binary"
"errors"
"fmt"
"io"
"io/ioutil"
"net"
"os"
"strconv"
"strings"
"sync"
"sync/atomic"
"syscall"
"unsafe"
)
// Priority of a journal message
type Priority int
const (
PriEmerg Priority = iota
PriAlert
PriCrit
PriErr
PriWarning
PriNotice
PriInfo
PriDebug
)
var (
// This can be overridden at build-time:
// https://github.com/golang/go/wiki/GcToolchainTricks#including-build-information-in-the-executable
journalSocket = "/run/systemd/journal/socket"
// unixConnPtr atomically holds the local unconnected Unix-domain socket.
// Concrete safe pointer type: *net.UnixConn
unixConnPtr unsafe.Pointer
// onceConn ensures that unixConnPtr is initialized exactly once.
onceConn sync.Once
)
func init() {
onceConn.Do(initConn)
}
// Enabled checks whether the local systemd journal is available for logging.
func Enabled() bool {
onceConn.Do(initConn)
if (*net.UnixConn)(atomic.LoadPointer(&unixConnPtr)) == nil {
return false
}
if _, err := net.Dial("unixgram", journalSocket); err != nil {
return false
}
return true
}
// Send a message to the local systemd journal. vars is a map of journald
// fields to values. Fields must be composed of uppercase letters, numbers,
// and underscores, but must not start with an underscore. Within these
// restrictions, any arbitrary field name may be used. Some names have special
// significance: see the journalctl documentation
// (http://www.freedesktop.org/software/systemd/man/systemd.journal-fields.html)
// for more details. vars may be nil.
func Send(message string, priority Priority, vars map[string]string) error {
conn := (*net.UnixConn)(atomic.LoadPointer(&unixConnPtr))
if conn == nil {
return errors.New("could not initialize socket to journald")
}
socketAddr := &net.UnixAddr{
Name: journalSocket,
Net: "unixgram",
}
data := new(bytes.Buffer)
appendVariable(data, "PRIORITY", strconv.Itoa(int(priority)))
appendVariable(data, "MESSAGE", message)
for k, v := range vars {
appendVariable(data, k, v)
}
_, _, err := conn.WriteMsgUnix(data.Bytes(), nil, socketAddr)
if err == nil {
return nil
}
if !isSocketSpaceError(err) {
return err
}
// Large log entry, send it via tempfile and ancillary-fd.
file, err := tempFd()
if err != nil {
return err
}
defer file.Close()
_, err = io.Copy(file, data)
if err != nil {
return err
}
rights := syscall.UnixRights(int(file.Fd()))
_, _, err = conn.WriteMsgUnix([]byte{}, rights, socketAddr)
if err != nil {
return err
}
return nil
}
// Print prints a message to the local systemd journal using Send().
func Print(priority Priority, format string, a ...interface{}) error {
return Send(fmt.Sprintf(format, a...), priority, nil)
}
func appendVariable(w io.Writer, name, value string) {
if err := validVarName(name); err != nil {
fmt.Fprintf(os.Stderr, "variable name %s contains invalid character, ignoring\n", name)
}
if strings.ContainsRune(value, '\n') {
/* When the value contains a newline, we write:
* - the variable name, followed by a newline
* - the size (in 64bit little endian format)
* - the data, followed by a newline
*/
fmt.Fprintln(w, name)
binary.Write(w, binary.LittleEndian, uint64(len(value)))
fmt.Fprintln(w, value)
} else {
/* just write the variable and value all on one line */
fmt.Fprintf(w, "%s=%s\n", name, value)
}
}
// validVarName validates a variable name to make sure journald will accept it.
// The variable name must be in uppercase and consist only of characters,
// numbers and underscores, and may not begin with an underscore:
// https://www.freedesktop.org/software/systemd/man/sd_journal_print.html
func validVarName(name string) error {
if name == "" {
return errors.New("Empty variable name")
} else if name[0] == '_' {
return errors.New("Variable name begins with an underscore")
}
for _, c := range name {
if !(('A' <= c && c <= 'Z') || ('0' <= c && c <= '9') || c == '_') {
return errors.New("Variable name contains invalid characters")
}
}
return nil
}
// isSocketSpaceError checks whether the error is signaling
// an "overlarge message" condition.
func isSocketSpaceError(err error) bool {
opErr, ok := err.(*net.OpError)
if !ok || opErr == nil {
return false
}
sysErr, ok := opErr.Err.(*os.SyscallError)
if !ok || sysErr == nil {
return false
}
return sysErr.Err == syscall.EMSGSIZE || sysErr.Err == syscall.ENOBUFS
}
// tempFd creates a temporary, unlinked file under `/dev/shm`.
func tempFd() (*os.File, error) {
file, err := ioutil.TempFile("/dev/shm/", "journal.XXXXX")
if err != nil {
return nil, err
}
err = syscall.Unlink(file.Name())
if err != nil {
return nil, err
}
return file, nil
}
// initConn initializes the global `unixConnPtr` socket.
// It is meant to be called exactly once, at program startup.
func initConn() {
autobind, err := net.ResolveUnixAddr("unixgram", "")
if err != nil {
return
}
sock, err := net.ListenUnixgram("unixgram", autobind)
if err != nil {
return
}
atomic.StorePointer(&unixConnPtr, unsafe.Pointer(sock))
}
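Enabled, Send and Print are the whole public surface of the package: Enabled reports whether the journald socket is reachable, Send writes one structured entry, and Print is the printf-style convenience built on Send. A minimal sketch (the field name and values are hypothetical):

package main

import (
	"fmt"

	"github.com/coreos/go-systemd/journal"
)

func main() {
	if !journal.Enabled() {
		fmt.Println("journald socket not available, logging to stdout instead")
		return
	}

	// Field names must be uppercase and may not start with "_".
	_ = journal.Send("node joined the cluster", journal.PriInfo, map[string]string{
		"DEEVIRT_NODE": "node-1", // hypothetical field, for illustration only
	})

	_ = journal.Print(journal.PriNotice, "purged %d stale segments", 3)
}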

202
vendor/github.com/coreos/pkg/LICENSE generated vendored Normal file
View File

@ -0,0 +1,202 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "{}"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright {yyyy} {name of copyright owner}
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

39
vendor/github.com/coreos/pkg/capnslog/README.md generated vendored Normal file
View File

@ -0,0 +1,39 @@
# capnslog, the CoreOS logging package
There are far too many logging packages out there: they come with varying licenses, pile on far too many features (colorization, all sorts of log frameworks), or are just a pain to use (lack of `Fatalln()`?).
capnslog provides a simple but consistent logging interface suitable for all kinds of projects.
### Design Principles
##### `package main` is the place where logging gets turned on and routed
A library should not touch log options, only generate log entries. Libraries are silent until main lets them speak.
##### All log options are runtime-configurable.
It is still the job of `main` to expose these configurations. `main` may delegate this to, say, a configuration webhook, but does so explicitly.
##### There is one log object per package. It is registered under its repository and package name.
`main` activates logging for its repository and any dependency repositories it would also like to have output in its logstream. `main` also dictates at which level each subpackage logs.
##### There is *one* output stream, and it is an `io.Writer` composed with a formatter.
Splitting streams is probably not the job of your program, but rather, your log aggregation framework. If you must split output streams, again, `main` configures this and you can write a very simple two-output struct that satisfies io.Writer.
Fancy colorful formatting and JSON output are beyond the scope of a basic logging framework -- they're application/log-collector dependent. These are, at best, provided as options, but more likely, provided by your application.
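A hedged sketch of that composition (the file path and names are illustrative): `main` builds the single output stream, here stdout plus a file via `io.MultiWriter`, and hands it to one of the provided formatters.
```go
package main

import (
	"io"
	"os"

	"github.com/coreos/pkg/capnslog"
)

func main() {
	// main decides where the one output stream goes; the path is illustrative.
	f, err := os.Create("/tmp/app.log")
	if err != nil {
		panic(err)
	}
	defer f.Close()

	// io.MultiWriter plays the role of the "two-output struct" mentioned above.
	w := io.MultiWriter(os.Stdout, f)
	capnslog.SetFormatter(capnslog.NewPrettyFormatter(w, false))
}
```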
##### Log objects are an interface
An object knows best how to print itself. Log objects can collect more interesting metadata if they wish; however, because text isn't going away anytime soon, they must all be marshalable to text. The simplest log object is a string, which returns itself. If you wish to do more fancy tricks for printing your log objects, see also JSON output -- introspect and write a formatter which can handle your advanced log interface. Making strings is the only thing guaranteed.
##### Log levels have specific meanings:
* Critical: Unrecoverable. Must fail.
* Error: Data has been lost, a request has failed for a bad reason, or a required resource has been lost
* Warning: (Hopefully) Temporary conditions that may cause errors, but may work fine. A replica disappearing (that may reconnect) is a warning.
* Notice: Normal, but important (uncommon) log information.
* Info: Normal, working log information, everything is fine, but helpful notices for auditing or common operations.
* Debug: Everything is still fine, but even common operations may be logged, and less helpful but more quantity of notices.
* Trace: Anything goes, from logging every function call as part of a common operation, to tracing execution of a query.
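Putting the principles and levels above together, a minimal usage sketch; the repository and package names are placeholders, not part of any real project.
```go
package main

import "github.com/coreos/pkg/capnslog"

// One logger per package, registered under repository and package name.
var plog = capnslog.NewPackageLogger("github.com/example/project", "main")

func main() {
	// Only main turns logging on and picks the level.
	capnslog.SetGlobalLogLevel(capnslog.INFO)

	plog.Infof("listening on %s", ":8080")
	plog.Debug("suppressed at the INFO level")
	plog.Warning("replica disappeared; it may reconnect")
}
```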

157
vendor/github.com/coreos/pkg/capnslog/formatters.go generated vendored Normal file
View File

@ -0,0 +1,157 @@
// Copyright 2015 CoreOS, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package capnslog
import (
"bufio"
"fmt"
"io"
"log"
"runtime"
"strings"
"time"
)
type Formatter interface {
Format(pkg string, level LogLevel, depth int, entries ...interface{})
Flush()
}
func NewStringFormatter(w io.Writer) Formatter {
return &StringFormatter{
w: bufio.NewWriter(w),
}
}
type StringFormatter struct {
w *bufio.Writer
}
func (s *StringFormatter) Format(pkg string, l LogLevel, i int, entries ...interface{}) {
now := time.Now().UTC()
s.w.WriteString(now.Format(time.RFC3339))
s.w.WriteByte(' ')
writeEntries(s.w, pkg, l, i, entries...)
s.Flush()
}
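// writeEntries prefixes the output with the package name (when set), writes
// the formatted entries, and guarantees a trailing newline.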
func writeEntries(w *bufio.Writer, pkg string, _ LogLevel, _ int, entries ...interface{}) {
if pkg != "" {
w.WriteString(pkg + ": ")
}
str := fmt.Sprint(entries...)
endsInNL := strings.HasSuffix(str, "\n")
w.WriteString(str)
if !endsInNL {
w.WriteString("\n")
}
}
func (s *StringFormatter) Flush() {
s.w.Flush()
}
func NewPrettyFormatter(w io.Writer, debug bool) Formatter {
return &PrettyFormatter{
w: bufio.NewWriter(w),
debug: debug,
}
}
type PrettyFormatter struct {
w *bufio.Writer
debug bool
}
func (c *PrettyFormatter) Format(pkg string, l LogLevel, depth int, entries ...interface{}) {
now := time.Now()
ts := now.Format("2006-01-02 15:04:05")
c.w.WriteString(ts)
ms := now.Nanosecond() / 1000
c.w.WriteString(fmt.Sprintf(".%06d", ms))
if c.debug {
_, file, line, ok := runtime.Caller(depth) // It's always the same number of frames to the user's call.
if !ok {
file = "???"
line = 1
} else {
slash := strings.LastIndex(file, "/")
if slash >= 0 {
file = file[slash+1:]
}
}
if line < 0 {
line = 0 // not a real line number
}
c.w.WriteString(fmt.Sprintf(" [%s:%d]", file, line))
}
c.w.WriteString(fmt.Sprint(" ", l.Char(), " | "))
writeEntries(c.w, pkg, l, depth, entries...)
c.Flush()
}
func (c *PrettyFormatter) Flush() {
c.w.Flush()
}
// LogFormatter emulates the form of the traditional built-in logger.
type LogFormatter struct {
logger *log.Logger
prefix string
}
// NewLogFormatter is a helper to produce a new LogFormatter struct. It uses the
// golang log package to actually do the logging work so that logs look similar.
func NewLogFormatter(w io.Writer, prefix string, flag int) Formatter {
return &LogFormatter{
logger: log.New(w, "", flag), // don't use prefix here
prefix: prefix, // save it instead
}
}
// Format builds a log message for the LogFormatter. The LogLevel is ignored.
func (lf *LogFormatter) Format(pkg string, _ LogLevel, _ int, entries ...interface{}) {
str := fmt.Sprint(entries...)
prefix := lf.prefix
if pkg != "" {
prefix = fmt.Sprintf("%s%s: ", prefix, pkg)
}
lf.logger.Output(5, fmt.Sprintf("%s%v", prefix, str)) // call depth is 5
}
// Flush is included so that the interface is complete, but is a no-op.
func (lf *LogFormatter) Flush() {
// noop
}
// NilFormatter is a no-op log formatter that does nothing.
type NilFormatter struct {
}
// NewNilFormatter is a helper to produce a new LogFormatter struct. It logs no
// messages so that you can cause part of your logging to be silent.
func NewNilFormatter() Formatter {
return &NilFormatter{}
}
// Format does nothing.
func (_ *NilFormatter) Format(_ string, _ LogLevel, _ int, _ ...interface{}) {
// noop
}
// Flush is included so that the interface is complete, but is a no-op.
func (_ *NilFormatter) Flush() {
// noop
}

View File

@ -0,0 +1,96 @@
// Copyright 2015 CoreOS, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package capnslog
import (
"bufio"
"bytes"
"io"
"os"
"runtime"
"strconv"
"strings"
"time"
)
var pid = os.Getpid()
type GlogFormatter struct {
StringFormatter
}
func NewGlogFormatter(w io.Writer) *GlogFormatter {
g := &GlogFormatter{}
g.w = bufio.NewWriter(w)
return g
}
func (g GlogFormatter) Format(pkg string, level LogLevel, depth int, entries ...interface{}) {
g.w.Write(GlogHeader(level, depth+1))
g.StringFormatter.Format(pkg, level, depth+1, entries...)
}
func GlogHeader(level LogLevel, depth int) []byte {
// Lmmdd hh:mm:ss.uuuuuu threadid file:line]
now := time.Now().UTC()
_, file, line, ok := runtime.Caller(depth) // It's always the same number of frames to the user's call.
if !ok {
file = "???"
line = 1
} else {
slash := strings.LastIndex(file, "/")
if slash >= 0 {
file = file[slash+1:]
}
}
if line < 0 {
line = 0 // not a real line number
}
buf := &bytes.Buffer{}
buf.Grow(30)
_, month, day := now.Date()
hour, minute, second := now.Clock()
buf.WriteString(level.Char())
twoDigits(buf, int(month))
twoDigits(buf, day)
buf.WriteByte(' ')
twoDigits(buf, hour)
buf.WriteByte(':')
twoDigits(buf, minute)
buf.WriteByte(':')
twoDigits(buf, second)
buf.WriteByte('.')
buf.WriteString(strconv.Itoa(now.Nanosecond() / 1000))
buf.WriteByte('Z')
buf.WriteByte(' ')
buf.WriteString(strconv.Itoa(pid))
buf.WriteByte(' ')
buf.WriteString(file)
buf.WriteByte(':')
buf.WriteString(strconv.Itoa(line))
buf.WriteByte(']')
buf.WriteByte(' ')
return buf.Bytes()
}
const digits = "0123456789"
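// twoDigits writes d to b as exactly two zero-padded decimal digits.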
func twoDigits(b *bytes.Buffer, d int) {
c2 := digits[d%10]
d /= 10
c1 := digits[d%10]
b.WriteByte(c1)
b.WriteByte(c2)
}

50
vendor/github.com/coreos/pkg/capnslog/init.go generated vendored Normal file
View File

@ -0,0 +1,50 @@
// Copyright 2015 CoreOS, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
//go:build !windows
// +build !windows
package capnslog
import (
"io"
"os"
"syscall"
)
// Here's where the opinionation comes in. We need some sensible defaults,
// especially after taking over the log package. Your project (whatever it may
// be) may see things differently. That's okay; there should be no defaults in
// the main package that cannot be controlled or overridden programmatically,
// otherwise it's a bug. Doing so is creating your own init_log.go file much
// like this one.
func init() {
initHijack()
// Go `log` package uses os.Stderr.
SetFormatter(NewDefaultFormatter(os.Stderr))
SetGlobalLogLevel(INFO)
}
func NewDefaultFormatter(out io.Writer) Formatter {
if syscall.Getppid() == 1 {
// We're running under init, which may be systemd.
f, err := NewJournaldFormatter()
if err == nil {
return f
}
}
return NewPrettyFormatter(out, false)
}

25
vendor/github.com/coreos/pkg/capnslog/init_windows.go generated vendored Normal file
View File

@ -0,0 +1,25 @@
// Copyright 2015 CoreOS, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package capnslog
import "os"
func init() {
initHijack()
// Go `log` package uses os.Stderr.
SetFormatter(NewPrettyFormatter(os.Stderr, false))
SetGlobalLogLevel(INFO)
}

View File

@ -0,0 +1,69 @@
// Copyright 2015 CoreOS, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
//go:build !windows
// +build !windows
package capnslog
import (
"errors"
"fmt"
"os"
"path/filepath"
"github.com/coreos/go-systemd/journal"
)
func NewJournaldFormatter() (Formatter, error) {
if !journal.Enabled() {
return nil, errors.New("No systemd detected")
}
return &journaldFormatter{}, nil
}
type journaldFormatter struct{}
func (j *journaldFormatter) Format(pkg string, l LogLevel, _ int, entries ...interface{}) {
var pri journal.Priority
switch l {
case CRITICAL:
pri = journal.PriCrit
case ERROR:
pri = journal.PriErr
case WARNING:
pri = journal.PriWarning
case NOTICE:
pri = journal.PriNotice
case INFO:
pri = journal.PriInfo
case DEBUG:
pri = journal.PriDebug
case TRACE:
pri = journal.PriDebug
default:
panic("Unhandled loglevel")
}
msg := fmt.Sprint(entries...)
tags := map[string]string{
"PACKAGE": pkg,
"SYSLOG_IDENTIFIER": filepath.Base(os.Args[0]),
}
err := journal.Send(msg, pri, tags)
if err != nil {
fmt.Fprintln(os.Stderr, err)
}
}
func (j *journaldFormatter) Flush() {}

39
vendor/github.com/coreos/pkg/capnslog/log_hijack.go generated vendored Normal file
View File

@ -0,0 +1,39 @@
// Copyright 2015 CoreOS, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package capnslog
import (
"log"
)
func initHijack() {
pkg := NewPackageLogger("log", "")
w := packageWriter{pkg}
log.SetFlags(0)
log.SetPrefix("")
log.SetOutput(w)
}
type packageWriter struct {
pl *PackageLogger
}
func (p packageWriter) Write(b []byte) (int, error) {
if p.pl.level < INFO {
return 0, nil
}
p.pl.internalLog(calldepth+2, INFO, string(b))
return len(b), nil
}

245
vendor/github.com/coreos/pkg/capnslog/logmap.go generated vendored Normal file
View File

@ -0,0 +1,245 @@
// Copyright 2015 CoreOS, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package capnslog
import (
"errors"
"strings"
"sync"
)
// LogLevel is the set of all log levels.
type LogLevel int8
const (
// CRITICAL is the lowest log level; only errors which will end the program will be propagated.
CRITICAL LogLevel = iota - 1
// ERROR is for errors that are not fatal but lead to troubling behavior.
ERROR
// WARNING is for errors which are not fatal and not errors, but are unusual. Often sourced from misconfigurations.
WARNING
// NOTICE is for normal but significant conditions.
NOTICE
// INFO is a log level for common, everyday log updates.
INFO
// DEBUG is the default hidden level for more verbose updates about internal processes.
DEBUG
// TRACE is for (potentially) call by call tracing of programs.
TRACE
)
// Char returns a single-character representation of the log level.
func (l LogLevel) Char() string {
switch l {
case CRITICAL:
return "C"
case ERROR:
return "E"
case WARNING:
return "W"
case NOTICE:
return "N"
case INFO:
return "I"
case DEBUG:
return "D"
case TRACE:
return "T"
default:
panic("Unhandled loglevel")
}
}
// String returns a multi-character representation of the log level.
func (l LogLevel) String() string {
switch l {
case CRITICAL:
return "CRITICAL"
case ERROR:
return "ERROR"
case WARNING:
return "WARNING"
case NOTICE:
return "NOTICE"
case INFO:
return "INFO"
case DEBUG:
return "DEBUG"
case TRACE:
return "TRACE"
default:
panic("Unhandled loglevel")
}
}
// Update using the given string value. Fulfills the flag.Value interface.
func (l *LogLevel) Set(s string) error {
value, err := ParseLevel(s)
if err != nil {
return err
}
*l = value
return nil
}
// Returns an empty string, only here to fulfill the pflag.Value interface.
func (l *LogLevel) Type() string {
return ""
}
// ParseLevel translates some potential loglevel strings into their corresponding levels.
func ParseLevel(s string) (LogLevel, error) {
switch s {
case "CRITICAL", "C":
return CRITICAL, nil
case "ERROR", "0", "E":
return ERROR, nil
case "WARNING", "1", "W":
return WARNING, nil
case "NOTICE", "2", "N":
return NOTICE, nil
case "INFO", "3", "I":
return INFO, nil
case "DEBUG", "4", "D":
return DEBUG, nil
case "TRACE", "5", "T":
return TRACE, nil
}
return CRITICAL, errors.New("couldn't parse log level " + s)
}
type RepoLogger map[string]*PackageLogger
type loggerStruct struct {
sync.Mutex
repoMap map[string]RepoLogger
formatter Formatter
}
// logger is the global logger
var logger = new(loggerStruct)
// SetGlobalLogLevel sets the log level for all packages in all repositories
// registered with capnslog.
func SetGlobalLogLevel(l LogLevel) {
logger.Lock()
defer logger.Unlock()
for _, r := range logger.repoMap {
r.setRepoLogLevelInternal(l)
}
}
// GetRepoLogger may return the handle to the repository's set of packages' loggers.
func GetRepoLogger(repo string) (RepoLogger, error) {
logger.Lock()
defer logger.Unlock()
r, ok := logger.repoMap[repo]
if !ok {
return nil, errors.New("no packages registered for repo " + repo)
}
return r, nil
}
// MustRepoLogger returns the handle to the repository's packages' loggers.
func MustRepoLogger(repo string) RepoLogger {
r, err := GetRepoLogger(repo)
if err != nil {
panic(err)
}
return r
}
// SetRepoLogLevel sets the log level for all packages in the repository.
func (r RepoLogger) SetRepoLogLevel(l LogLevel) {
logger.Lock()
defer logger.Unlock()
r.setRepoLogLevelInternal(l)
}
func (r RepoLogger) setRepoLogLevelInternal(l LogLevel) {
for _, v := range r {
v.level = l
}
}
// ParseLogLevelConfig parses a comma-separated string of "package=loglevel", in
// order, and returns a map of the results, for use in SetLogLevel.
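// For example, "*=NOTICE,storage=DEBUG" yields {"*": NOTICE, "storage": DEBUG};
// when passed to SetLogLevel, "*" is applied to every package first and the
// per-package entries override it. (Illustrative package names.)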
func (r RepoLogger) ParseLogLevelConfig(conf string) (map[string]LogLevel, error) {
setlist := strings.Split(conf, ",")
out := make(map[string]LogLevel)
for _, setstring := range setlist {
setting := strings.Split(setstring, "=")
if len(setting) != 2 {
return nil, errors.New("oddly structured `pkg=level` option: " + setstring)
}
l, err := ParseLevel(setting[1])
if err != nil {
return nil, err
}
out[setting[0]] = l
}
return out, nil
}
// SetLogLevel takes a map of package names within a repository to their desired
// loglevel, and sets the levels appropriately. Unknown packages are ignored.
// "*" is a special package name that corresponds to all packages, and will be
// processed first.
func (r RepoLogger) SetLogLevel(m map[string]LogLevel) {
logger.Lock()
defer logger.Unlock()
if l, ok := m["*"]; ok {
r.setRepoLogLevelInternal(l)
}
for k, v := range m {
l, ok := r[k]
if !ok {
continue
}
l.level = v
}
}
// SetFormatter sets the formatting function for all logs.
func SetFormatter(f Formatter) {
logger.Lock()
defer logger.Unlock()
logger.formatter = f
}
// NewPackageLogger creates a package logger object.
// This should be defined as a global var in your package, referencing your repo.
func NewPackageLogger(repo string, pkg string) (p *PackageLogger) {
logger.Lock()
defer logger.Unlock()
if logger.repoMap == nil {
logger.repoMap = make(map[string]RepoLogger)
}
r, rok := logger.repoMap[repo]
if !rok {
logger.repoMap[repo] = make(RepoLogger)
r = logger.repoMap[repo]
}
p, pok := r[pkg]
if !pok {
r[pkg] = &PackageLogger{
pkg: pkg,
level: INFO,
}
p = r[pkg]
}
return
}

191
vendor/github.com/coreos/pkg/capnslog/pkg_logger.go generated vendored Normal file
View File

@ -0,0 +1,191 @@
// Copyright 2015 CoreOS, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package capnslog
import (
"fmt"
"os"
)
type PackageLogger struct {
pkg string
level LogLevel
}
const calldepth = 2
func (p *PackageLogger) internalLog(depth int, inLevel LogLevel, entries ...interface{}) {
logger.Lock()
defer logger.Unlock()
if inLevel != CRITICAL && p.level < inLevel {
return
}
if logger.formatter != nil {
logger.formatter.Format(p.pkg, inLevel, depth+1, entries...)
}
}
// SetLevel allows users to change the current logging level.
func (p *PackageLogger) SetLevel(l LogLevel) {
logger.Lock()
defer logger.Unlock()
p.level = l
}
// LevelAt checks if the given log level will be outputted under current setting.
func (p *PackageLogger) LevelAt(l LogLevel) bool {
logger.Lock()
defer logger.Unlock()
return p.level >= l
}
// Log a formatted string at any level between ERROR and TRACE
func (p *PackageLogger) Logf(l LogLevel, format string, args ...interface{}) {
p.internalLog(calldepth, l, fmt.Sprintf(format, args...))
}
// Log a message at any level between ERROR and TRACE
func (p *PackageLogger) Log(l LogLevel, args ...interface{}) {
p.internalLog(calldepth, l, fmt.Sprint(args...))
}
// log stdlib compatibility
func (p *PackageLogger) Println(args ...interface{}) {
p.internalLog(calldepth, INFO, fmt.Sprintln(args...))
}
func (p *PackageLogger) Printf(format string, args ...interface{}) {
p.Logf(INFO, format, args...)
}
func (p *PackageLogger) Print(args ...interface{}) {
p.internalLog(calldepth, INFO, fmt.Sprint(args...))
}
// Panic and fatal
func (p *PackageLogger) Panicf(format string, args ...interface{}) {
s := fmt.Sprintf(format, args...)
p.internalLog(calldepth, CRITICAL, s)
panic(s)
}
func (p *PackageLogger) Panic(args ...interface{}) {
s := fmt.Sprint(args...)
p.internalLog(calldepth, CRITICAL, s)
panic(s)
}
func (p *PackageLogger) Panicln(args ...interface{}) {
s := fmt.Sprintln(args...)
p.internalLog(calldepth, CRITICAL, s)
panic(s)
}
func (p *PackageLogger) Fatalf(format string, args ...interface{}) {
p.Logf(CRITICAL, format, args...)
os.Exit(1)
}
func (p *PackageLogger) Fatal(args ...interface{}) {
s := fmt.Sprint(args...)
p.internalLog(calldepth, CRITICAL, s)
os.Exit(1)
}
func (p *PackageLogger) Fatalln(args ...interface{}) {
s := fmt.Sprintln(args...)
p.internalLog(calldepth, CRITICAL, s)
os.Exit(1)
}
// Error Functions
func (p *PackageLogger) Errorf(format string, args ...interface{}) {
p.Logf(ERROR, format, args...)
}
func (p *PackageLogger) Error(entries ...interface{}) {
p.internalLog(calldepth, ERROR, entries...)
}
// Warning Functions
func (p *PackageLogger) Warningf(format string, args ...interface{}) {
p.Logf(WARNING, format, args...)
}
func (p *PackageLogger) Warning(entries ...interface{}) {
p.internalLog(calldepth, WARNING, entries...)
}
// Notice Functions
func (p *PackageLogger) Noticef(format string, args ...interface{}) {
p.Logf(NOTICE, format, args...)
}
func (p *PackageLogger) Notice(entries ...interface{}) {
p.internalLog(calldepth, NOTICE, entries...)
}
// Info Functions
func (p *PackageLogger) Infof(format string, args ...interface{}) {
p.Logf(INFO, format, args...)
}
func (p *PackageLogger) Info(entries ...interface{}) {
p.internalLog(calldepth, INFO, entries...)
}
// Debug Functions
func (p *PackageLogger) Debugf(format string, args ...interface{}) {
if p.level < DEBUG {
return
}
p.Logf(DEBUG, format, args...)
}
func (p *PackageLogger) Debug(entries ...interface{}) {
if p.level < DEBUG {
return
}
p.internalLog(calldepth, DEBUG, entries...)
}
// Trace Functions
func (p *PackageLogger) Tracef(format string, args ...interface{}) {
if p.level < TRACE {
return
}
p.Logf(TRACE, format, args...)
}
func (p *PackageLogger) Trace(entries ...interface{}) {
if p.level < TRACE {
return
}
p.internalLog(calldepth, TRACE, entries...)
}
func (p *PackageLogger) Flush() {
logger.Lock()
defer logger.Unlock()
logger.formatter.Flush()
}

View File

@ -0,0 +1,66 @@
// Copyright 2015 CoreOS, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
//go:build !windows
// +build !windows
package capnslog
import (
"fmt"
"log/syslog"
)
func NewSyslogFormatter(w *syslog.Writer) Formatter {
return &syslogFormatter{w}
}
func NewDefaultSyslogFormatter(tag string) (Formatter, error) {
w, err := syslog.New(syslog.LOG_DEBUG, tag)
if err != nil {
return nil, err
}
return NewSyslogFormatter(w), nil
}
type syslogFormatter struct {
w *syslog.Writer
}
func (s *syslogFormatter) Format(pkg string, l LogLevel, _ int, entries ...interface{}) {
for _, entry := range entries {
str := fmt.Sprint(entry)
switch l {
case CRITICAL:
s.w.Crit(str)
case ERROR:
s.w.Err(str)
case WARNING:
s.w.Warning(str)
case NOTICE:
s.w.Notice(str)
case INFO:
s.w.Info(str)
case DEBUG:
s.w.Debug(str)
case TRACE:
s.w.Debug(str)
default:
panic("Unhandled loglevel")
}
}
}
func (s *syslogFormatter) Flush() {
}

375
vendor/github.com/hashicorp/raft-wal/LICENSE generated vendored Normal file
View File

@ -0,0 +1,375 @@
Copyright (c) 2022 HashiCorp, Inc.
Mozilla Public License Version 2.0
==================================
1. Definitions
--------------
1.1. "Contributor"
means each individual or legal entity that creates, contributes to
the creation of, or owns Covered Software.
1.2. "Contributor Version"
means the combination of the Contributions of others (if any) used
by a Contributor and that particular Contributor's Contribution.
1.3. "Contribution"
means Covered Software of a particular Contributor.
1.4. "Covered Software"
means Source Code Form to which the initial Contributor has attached
the notice in Exhibit A, the Executable Form of such Source Code
Form, and Modifications of such Source Code Form, in each case
including portions thereof.
1.5. "Incompatible With Secondary Licenses"
means
(a) that the initial Contributor has attached the notice described
in Exhibit B to the Covered Software; or
(b) that the Covered Software was made available under the terms of
version 1.1 or earlier of the License, but not also under the
terms of a Secondary License.
1.6. "Executable Form"
means any form of the work other than Source Code Form.
1.7. "Larger Work"
means a work that combines Covered Software with other material, in
a separate file or files, that is not Covered Software.
1.8. "License"
means this document.
1.9. "Licensable"
means having the right to grant, to the maximum extent possible,
whether at the time of the initial grant or subsequently, any and
all of the rights conveyed by this License.
1.10. "Modifications"
means any of the following:
(a) any file in Source Code Form that results from an addition to,
deletion from, or modification of the contents of Covered
Software; or
(b) any new file in Source Code Form that contains any Covered
Software.
1.11. "Patent Claims" of a Contributor
means any patent claim(s), including without limitation, method,
process, and apparatus claims, in any patent Licensable by such
Contributor that would be infringed, but for the grant of the
License, by the making, using, selling, offering for sale, having
made, import, or transfer of either its Contributions or its
Contributor Version.
1.12. "Secondary License"
means either the GNU General Public License, Version 2.0, the GNU
Lesser General Public License, Version 2.1, the GNU Affero General
Public License, Version 3.0, or any later versions of those
licenses.
1.13. "Source Code Form"
means the form of the work preferred for making modifications.
1.14. "You" (or "Your")
means an individual or a legal entity exercising rights under this
License. For legal entities, "You" includes any entity that
controls, is controlled by, or is under common control with You. For
purposes of this definition, "control" means (a) the power, direct
or indirect, to cause the direction or management of such entity,
whether by contract or otherwise, or (b) ownership of more than
fifty percent (50%) of the outstanding shares or beneficial
ownership of such entity.
2. License Grants and Conditions
--------------------------------
2.1. Grants
Each Contributor hereby grants You a world-wide, royalty-free,
non-exclusive license:
(a) under intellectual property rights (other than patent or trademark)
Licensable by such Contributor to use, reproduce, make available,
modify, display, perform, distribute, and otherwise exploit its
Contributions, either on an unmodified basis, with Modifications, or
as part of a Larger Work; and
(b) under Patent Claims of such Contributor to make, use, sell, offer
for sale, have made, import, and otherwise transfer either its
Contributions or its Contributor Version.
2.2. Effective Date
The licenses granted in Section 2.1 with respect to any Contribution
become effective for each Contribution on the date the Contributor first
distributes such Contribution.
2.3. Limitations on Grant Scope
The licenses granted in this Section 2 are the only rights granted under
this License. No additional rights or licenses will be implied from the
distribution or licensing of Covered Software under this License.
Notwithstanding Section 2.1(b) above, no patent license is granted by a
Contributor:
(a) for any code that a Contributor has removed from Covered Software;
or
(b) for infringements caused by: (i) Your and any other third party's
modifications of Covered Software, or (ii) the combination of its
Contributions with other software (except as part of its Contributor
Version); or
(c) under Patent Claims infringed by Covered Software in the absence of
its Contributions.
This License does not grant any rights in the trademarks, service marks,
or logos of any Contributor (except as may be necessary to comply with
the notice requirements in Section 3.4).
2.4. Subsequent Licenses
No Contributor makes additional grants as a result of Your choice to
distribute the Covered Software under a subsequent version of this
License (see Section 10.2) or under the terms of a Secondary License (if
permitted under the terms of Section 3.3).
2.5. Representation
Each Contributor represents that the Contributor believes its
Contributions are its original creation(s) or it has sufficient rights
to grant the rights to its Contributions conveyed by this License.
2.6. Fair Use
This License is not intended to limit any rights You have under
applicable copyright doctrines of fair use, fair dealing, or other
equivalents.
2.7. Conditions
Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted
in Section 2.1.
3. Responsibilities
-------------------
3.1. Distribution of Source Form
All distribution of Covered Software in Source Code Form, including any
Modifications that You create or to which You contribute, must be under
the terms of this License. You must inform recipients that the Source
Code Form of the Covered Software is governed by the terms of this
License, and how they can obtain a copy of this License. You may not
attempt to alter or restrict the recipients' rights in the Source Code
Form.
3.2. Distribution of Executable Form
If You distribute Covered Software in Executable Form then:
(a) such Covered Software must also be made available in Source Code
Form, as described in Section 3.1, and You must inform recipients of
the Executable Form how they can obtain a copy of such Source Code
Form by reasonable means in a timely manner, at a charge no more
than the cost of distribution to the recipient; and
(b) You may distribute such Executable Form under the terms of this
License, or sublicense it under different terms, provided that the
license for the Executable Form does not attempt to limit or alter
the recipients' rights in the Source Code Form under this License.
3.3. Distribution of a Larger Work
You may create and distribute a Larger Work under terms of Your choice,
provided that You also comply with the requirements of this License for
the Covered Software. If the Larger Work is a combination of Covered
Software with a work governed by one or more Secondary Licenses, and the
Covered Software is not Incompatible With Secondary Licenses, this
License permits You to additionally distribute such Covered Software
under the terms of such Secondary License(s), so that the recipient of
the Larger Work may, at their option, further distribute the Covered
Software under the terms of either this License or such Secondary
License(s).
3.4. Notices
You may not remove or alter the substance of any license notices
(including copyright notices, patent notices, disclaimers of warranty,
or limitations of liability) contained within the Source Code Form of
the Covered Software, except that You may alter any license notices to
the extent required to remedy known factual inaccuracies.
3.5. Application of Additional Terms
You may choose to offer, and to charge a fee for, warranty, support,
indemnity or liability obligations to one or more recipients of Covered
Software. However, You may do so only on Your own behalf, and not on
behalf of any Contributor. You must make it absolutely clear that any
such warranty, support, indemnity, or liability obligation is offered by
You alone, and You hereby agree to indemnify every Contributor for any
liability incurred by such Contributor as a result of warranty, support,
indemnity or liability terms You offer. You may include additional
disclaimers of warranty and limitations of liability specific to any
jurisdiction.
4. Inability to Comply Due to Statute or Regulation
---------------------------------------------------
If it is impossible for You to comply with any of the terms of this
License with respect to some or all of the Covered Software due to
statute, judicial order, or regulation then You must: (a) comply with
the terms of this License to the maximum extent possible; and (b)
describe the limitations and the code they affect. Such description must
be placed in a text file included with all distributions of the Covered
Software under this License. Except to the extent prohibited by statute
or regulation, such description must be sufficiently detailed for a
recipient of ordinary skill to be able to understand it.
5. Termination
--------------
5.1. The rights granted under this License will terminate automatically
if You fail to comply with any of its terms. However, if You become
compliant, then the rights granted under this License from a particular
Contributor are reinstated (a) provisionally, unless and until such
Contributor explicitly and finally terminates Your grants, and (b) on an
ongoing basis, if such Contributor fails to notify You of the
non-compliance by some reasonable means prior to 60 days after You have
come back into compliance. Moreover, Your grants from a particular
Contributor are reinstated on an ongoing basis if such Contributor
notifies You of the non-compliance by some reasonable means, this is the
first time You have received notice of non-compliance with this License
from such Contributor, and You become compliant prior to 30 days after
Your receipt of the notice.
5.2. If You initiate litigation against any entity by asserting a patent
infringement claim (excluding declaratory judgment actions,
counter-claims, and cross-claims) alleging that a Contributor Version
directly or indirectly infringes any patent, then the rights granted to
You by any and all Contributors for the Covered Software under Section
2.1 of this License shall terminate.
5.3. In the event of termination under Sections 5.1 or 5.2 above, all
end user license agreements (excluding distributors and resellers) which
have been validly granted by You or Your distributors under this License
prior to termination shall survive termination.
************************************************************************
* *
* 6. Disclaimer of Warranty *
* ------------------------- *
* *
* Covered Software is provided under this License on an "as is" *
* basis, without warranty of any kind, either expressed, implied, or *
* statutory, including, without limitation, warranties that the *
* Covered Software is free of defects, merchantable, fit for a *
* particular purpose or non-infringing. The entire risk as to the *
* quality and performance of the Covered Software is with You. *
* Should any Covered Software prove defective in any respect, You *
* (not any Contributor) assume the cost of any necessary servicing, *
* repair, or correction. This disclaimer of warranty constitutes an *
* essential part of this License. No use of any Covered Software is *
* authorized under this License except under this disclaimer. *
* *
************************************************************************
************************************************************************
* *
* 7. Limitation of Liability *
* -------------------------- *
* *
* Under no circumstances and under no legal theory, whether tort *
* (including negligence), contract, or otherwise, shall any *
* Contributor, or anyone who distributes Covered Software as *
* permitted above, be liable to You for any direct, indirect, *
* special, incidental, or consequential damages of any character *
* including, without limitation, damages for lost profits, loss of *
* goodwill, work stoppage, computer failure or malfunction, or any *
* and all other commercial damages or losses, even if such party *
* shall have been informed of the possibility of such damages. This *
* limitation of liability shall not apply to liability for death or *
* personal injury resulting from such party's negligence to the *
* extent applicable law prohibits such limitation. Some *
* jurisdictions do not allow the exclusion or limitation of *
* incidental or consequential damages, so this exclusion and *
* limitation may not apply to You. *
* *
************************************************************************
8. Litigation
-------------
Any litigation relating to this License may be brought only in the
courts of a jurisdiction where the defendant maintains its principal
place of business and such litigation shall be governed by laws of that
jurisdiction, without reference to its conflict-of-law provisions.
Nothing in this Section shall prevent a party's ability to bring
cross-claims or counter-claims.
9. Miscellaneous
----------------
This License represents the complete agreement concerning the subject
matter hereof. If any provision of this License is held to be
unenforceable, such provision shall be reformed only to the extent
necessary to make it enforceable. Any law or regulation which provides
that the language of a contract shall be construed against the drafter
shall not be used to construe this License against a Contributor.
10. Versions of the License
---------------------------
10.1. New Versions
Mozilla Foundation is the license steward. Except as provided in Section
10.3, no one other than the license steward has the right to modify or
publish new versions of this License. Each version will be given a
distinguishing version number.
10.2. Effect of New Versions
You may distribute the Covered Software under the terms of the version
of the License under which You originally received the Covered Software,
or under the terms of any subsequent version published by the license
steward.
10.3. Modified Versions
If you create software not governed by this License, and you want to
create a new license for such software, you may create and use a
modified version of this License if you rename the license and remove
any references to the name of the license steward (except to note that
such modified license differs from this License).
10.4. Distributing Source Code Form that is Incompatible With Secondary
Licenses
If You choose to distribute Source Code Form that is Incompatible With
Secondary Licenses under the terms of this version of the License, the
notice described in Exhibit B of this License must be attached.
Exhibit A - Source Code Form License Notice
-------------------------------------------
This Source Code Form is subject to the terms of the Mozilla Public
License, v. 2.0. If a copy of the MPL was not distributed with this
file, You can obtain one at http://mozilla.org/MPL/2.0/.
If it is not possible or desirable to put the notice in a particular
file, then You may include the notice in a location (such as a LICENSE
file in a relevant directory) where a recipient would be likely to look
for such a notice.
You may add additional accurate notices of copyright ownership.
Exhibit B - "Incompatible With Secondary Licenses" Notice
---------------------------------------------------------
This Source Code Form is "Incompatible With Secondary Licenses", as
defined by the Mozilla Public License, v. 2.0.

752
vendor/github.com/hashicorp/raft-wal/README.md generated vendored Normal file
View File

@ -0,0 +1,752 @@
# Raft WAL
This library implements a Write-Ahead Log (WAL) suitable for use with
[`hashicorp/raft`](https://github.com/hashicorp/raft).
Specifically, the library provides an instance of raft's `LogStore` and
`StableStore` interfaces for storing both raft logs and the other small items
that require stable storage (like which term the node last voted in).
**This library is still considered experimental!**
It is complete and reasonably well tested so far but we plan to complete more
rigorous end-to-end testing and performance analysis within our products and
together with some of our users before we consider this safe for production.
The advantages of this library over `hashicorp/raft-boltdb`, which has been used
for many years in HashiCorp products, are:
1. Efficient truncations that don't cause later appends to slow down due to
free space tracking issues in BoltDB's btree.
2. More efficient appends due to only one fsync per append vs two in BoltDB.
3. More efficient and suitable on-disk structure for a log vs a copy-on-write
BTree.
We aim to provide roughly equivalent resiliency to crashes as respected storage
systems such as SQLite, LevelDB/RocksDB and etcd. BoltDB technically has a
stronger property due to its page-aligned model (no partial sector overwrites).
We initially [designed a WAL on the same principles](/01-WAL-pages.md),
however felt that the additional complexity it adds wasn't justified given the
weaker assumptions that many other battle-tested systems above use.
Our design goals for crash recovery are:
- Crashes at any point must not lose committed log entries or result in a
corrupt file, even if in-flight sector writes are not atomic.
- We _do_ assume [Powersafe Overwrites](#powersafe-overwrites-psow) where
partial sectors can be appended to without corrupting existing data even in a
power failure.
- Latent errors (i.e. silent corruption in the FS or disk) _may_ be detected
during a read, but we assume that the file-system and disk are responsible
for this really. (i.e. we don't validate checksums on every record read).
This is equivalent to SQLite, LMDB, BoltDB etc.
See the [system assumptions](#system-assumptions) and [crash
safety](#crash-safety) sections for more details.
## Limitations
Here are some notable (but we think acceptable) limitations of this design.
* Segment files can't be larger than 4GiB. (Current default is 64MiB).
* Individual records can't be larger than 4GiB without changing the format.
(Current limit is 64MiB).
* Appended log entries must have monotonically increasing `Index` fields with
no gaps (though may start at any index in an empty log).
* Only head or tail truncations are supported. `DeleteRange` will error if the
range is not a prefix or suffix of the log. `hashicorp/raft` never needs
that.
* No encryption or compression support.
* Though we do provide a pluggable entry codec and internally treat each
entry as opaque bytes so it's possible to supply a custom codec that
transforms entries in any way desired.
* If the segment tail file is lost _after_ entries are committed to it due to
manual intervention or filesystem bug, the WAL can't distinguish that from a
crash during rotation that left the file missing since we don't update
metadata on every append for performance reasons. In most other cases,
missing data would be detected on recovery and fail the recovery to protect
from silent data loss, but in this particular case that's not possible
without significantly impacting performance in the steady state by updating
the last committed entry to meta DB on every append. We assume this is
reasonable since previous LogStore implementations would also "silently"
lose data if the database files were removed too.
## Storage Format Overview
The WAL has two types of file: a meta store and one or more log segments.
### Meta Store
We need to provide a `StableStore` interface for small amounts of Raft data. We
also need to store some meta data about log segments to simplify managing them
in an atomic and crash-safe way.
Since this data is _generally_ small we could invent our own storage format with
some sort of double-buffering and limit ourselves to a single page of data etc.
But since performance is not critical for meta-data operations and size is
extremely unlikely to get larger than a few KiB, we choose instead the pragmatic
approach of using BoltDB for our `wal-meta.db`.
The meta database contains two buckets: `stable` containing key/values persisted
by Raft via the `StableStore` interface, and `wal-state` which contains the
source-of-truth meta data about which segment files should be considered part of
the current log.
The `wal-state` bucket contains one record with all the state since it's only
loaded or persisted in one atomic batch and is small. The state is just a JSON
encoded object described by the following structs. JSON encoding is used as this
is not performance sensitive and it's simpler to work with and more human
readable.
```go
type PersistentState struct {
NextSegmentID uint64
Segments []SegmentInfo
}
type SegmentInfo struct {
ID uint64
BaseIndex uint64
MinIndex uint64
MaxIndex uint64
Codec uint64
IndexStart uint64
CreateTime time.Time
SealTime time.Time
}
```
The last segment (with highest baseIndex) is the "tail" and must be the only one where
`SealTime = 0` (i.e. it's unsealed). `IndexStart` and `MaxIndex` are also zero until
the segment is sealed.
Why use BoltDB when the main reason for this library is because the existing
BoltDB `LogStore` has performance issues?
Well, the major performance issue in `raft-boltdb` occurs when a large amount of
log data is written and then truncated: the overhead of tracking all the free
space in the file makes further appends slower.
Our use here is orders of magnitude lighter than storing all log data. As an
example, let's assume we allow 100GiB of logs to be kept around which is at
least an order of magnitude larger than the largest current known Consul user's
worst-case log size, and two orders of magnitude more than the largest Consul
deployments steady-state. Assuming fixed 64MiB segments, that would require
about 1600 segments which encode to about 125 bytes in JSON each. Even at this
extreme, the meta DB only has to hold under 200KiB.
Even if a truncation occurs that reduces that all the way back to a single
segment, 200KiB is only a hundred or so pages (allowing for btree overhead) so
the free list will never be larger than a single 4KB page.
On top of that, we only pay the cost of a write to BoltDB for meta-data
transactions: rotating to a new segment, or truncating. The vast majority of
appends only need to append to a log segment.
### Segment Files
Segment files are pre-allocated (if supported by the filesystem) on creation to
a fixed size. By default we use 64MiB segment files. This section defines the
encoding for those files. All integer types are encoded in little-endian order.
The file starts with a fixed-size header that is written once before the
first committed entries.
```
0      1      2      3      4      5      6      7      8
+------+------+------+------+------+------+------+------+
| Magic                     | Reserved           | Vsn  |
+------+------+------+------+------+------+------+------+
| BaseIndex                                             |
+------+------+------+------+------+------+------+------+
| SegmentID                                             |
+------+------+------+------+------+------+------+------+
| Codec                                                 |
+------+------+------+------+------+------+------+------+
```
| Field | Type | Description |
| ------------ | --------- | ----------- |
| `Magic` | `uint32` | The randomly chosen value `0x58eb6b0d`. |
| `Reserved` | `[3]byte` | Bytes reserved for future file flags. |
| `Vsn` | `uint8` | The version of the file, currently `0x0`. |
| `BaseIndex` | `uint64` | The raft Index of the first entry that will be stored in this file. |
| `SegmentID` | `uint64` | A unique identifier for this segment file. |
| `Codec` | `uint64` | The codec used to write the file. |
Each segment file is named `<BaseIndex>-<SegmentID>.wal`. `BaseIndex` is
formatted in decimal with leading zeros and a fixed width of 20 chars.
`SegmentID` is formatted in lower-case hex with zero padding to 16 chars wide.
This has the nice property of them sorting lexicographically in the directory,
although we don't rely on that.
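A small sketch of that naming rule; the helper name is ours, not part of the library's API.
```go
package main

import "fmt"

// segmentFileName renders <BaseIndex>-<SegmentID>.wal with BaseIndex as
// zero-padded decimal (20 chars) and SegmentID as zero-padded lower-case hex
// (16 chars), as described above.
func segmentFileName(baseIndex, segmentID uint64) string {
	return fmt.Sprintf("%020d-%016x.wal", baseIndex, segmentID)
}

func main() {
	fmt.Println(segmentFileName(1025, 0x2a))
	// Output: 00000000000000001025-000000000000002a.wal
}
```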
### Frames
Log entries are stored in consecutive frames after the header. As well as log
entry frames there are a few meta data frame types too. Each frame starts with
an 8-byte header.
```
0      1      2      3      4      5      6      7      8
+------+------+------+------+------+------+------+------+
| Type | Reserved           | Length/CRC                |
+------+------+------+------+------+------+------+------+
```
| Field | Type | Description |
| ------------- | ----------- | ----------- |
| `Type` | `uint8` | The frame type. See below. |
| `Length/CRC` | `uint32` | Depends on Type. See Below |
| Type | Value | Description |
| ---- | ----- | ----------- |
| `Invalid` | `0x0` | The frame is invalid. We make zero value invalid so we can detect unwritten frames cleanly. |
| `Entry` | `0x1` | The frame contains an entire log entry. |
| `Index` | `0x2` | The frame contains an index array, not actual log entries. |
| `Commit` | `0x3` | The frame contains a CRC for all data written in a batch. |
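A hedged sketch of the 8-byte frame header laid out above: one type byte, three reserved bytes, then a little-endian `uint32` carrying either Length or CRC depending on the frame type. The constant and function names are illustrative, not the library's.
```go
package main

import (
	"encoding/binary"
	"fmt"
)

// Frame types from the table above.
const (
	frameInvalid uint8 = 0x0
	frameEntry   uint8 = 0x1
	frameIndex   uint8 = 0x2
	frameCommit  uint8 = 0x3
)

// encodeFrameHeader packs the header: Type at byte 0, Reserved at bytes 1-3
// (left zero), and Length or CRC as a little-endian uint32 at bytes 4-7.
func encodeFrameHeader(typ uint8, lengthOrCRC uint32) [8]byte {
	var h [8]byte
	h[0] = typ
	binary.LittleEndian.PutUint32(h[4:], lengthOrCRC)
	return h
}

func main() {
	h := encodeFrameHeader(frameEntry, 512)
	fmt.Printf("% x\n", h[:]) // 01 00 00 00 00 02 00 00
}
```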
#### Index Frame
An index frame payload is an array of `uint32` file offsets for the
corresponding records. The first element of the array contains the file offset
of the frame containing the first entry in the segment and so on.
`Length` is used to indicate the length in bytes of the array (i.e. number of
entries in the segment is `Length/4`).
Index frames are written only when the segment is sealed and a commit frame
follows to validate the final write.
#### Commit Frame
A Commit frame marks the last write before fsync is called. In order to detect
incomplete or torn writes on recovery the commit frame stores a CRC of all the
bytes appended since the last fsync.
`CRC` is used to specify a CRC32 (Castagnoli) over all bytes written since the
last fsync. That is, since just after the last commit frame, or just after the
file header.
There may also be 4 bytes of padding to keep alignment. Later we could
use these too.
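A minimal sketch of that checksum, assuming the standard `hash/crc32` Castagnoli table; the function and variable names are illustrative.
```go
package main

import (
	"fmt"
	"hash/crc32"
)

var castagnoli = crc32.MakeTable(crc32.Castagnoli)

// commitCRC covers every byte appended since the last fsync (i.e. since the
// previous commit frame or the file header), padding included.
func commitCRC(bytesSinceLastFsync []byte) uint32 {
	return crc32.Checksum(bytesSinceLastFsync, castagnoli)
}

func main() {
	batch := []byte("frames appended in this batch, padding and all")
	fmt.Printf("commit frame CRC: 0x%08x\n", commitCRC(batch))
}
```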
#### Alignment
All frame headers are written with 8-byte alignment to ensure they remain in a
single disk sector. We don't entirely depend on atomic sector writes for
correctness, but it's a simple way to improve our chances of being able to read
through the file on a recovery with some sectors missing.
We add an implicit 0-7 null bytes after each frame to ensure the next frame
header is aligned. This padding is _not_ represented in `Length` but it is
always present and is deterministic by rounding up `Length` to the nearest
multiple of 8. It is always accounted for when reading and CRCs are calculated
over raw bytes written so always include the padding (zero) bytes.
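A sketch of that padding arithmetic; the helper name is ours.
```go
package main

import "fmt"

// padding returns the number of implicit zero bytes (0-7) appended after a
// frame payload of the given length so the next header stays 8-byte aligned.
func padding(length uint32) uint32 {
	return (8 - length%8) % 8
}

func main() {
	for _, n := range []uint32{8, 9, 15, 16} {
		fmt.Printf("length %2d -> %d padding bytes\n", n, padding(n))
	}
}
```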
Despite alignment we still don't blindly trust the headers we read are valid. A
CRC mismatch or invalid record format indicates torn writes in the last batch
written, and we always safety-check the size of lengths read before allocating
memory for them - entry lengths can't be bigger than the `MaxEntrySize` which
we default to 64MiB.
### Sealing
Once a segment file has grown larger than the configured soft-limit (64MiB
default), we "seal" it. This process involves:
1. Write out the in-memory index of record offsets to an index frame.
2. Write a commit frame to validate all bytes appended in this final append
(which probably included one or more records that took the segment file over
the limit).
3. Return the final `IndexStart` to be stored in `wal-meta.db`
Sealed files can have their indexes read directly on open from the IndexStart in
`wal-meta.db` so records can be looked up in constant time.
## Log Lookup by Index
For an unsealed segment we first lookup the offset in the in-memory index.
For a sealed segment we can discover the index frame location from the metadata
and then perform a read at the right location in the file to lookup the record's
offset. Implementations may choose to cache or memory-map the index array but we
will initially just read the specific entry we need each time and assume the OS
page cache will make that fast for frequently accessed index areas or in-order
traversals. We don't have to read the whole index, just the 4 byte entry we care
about since we can work out its offset from IndexStart, the BaseIndex of the
segment, and the Index being searched for.
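A rough sketch of that constant-time lookup. It assumes, for illustration, that `indexStart` points at the first element of the `uint32` offset array; the README does not pin that detail down, so treat the arithmetic as an assumption.
```go
package main

import "fmt"

// indexEntryPos returns the file position of the 4-byte index-array element
// that holds the offset of raft index idx, assuming indexStart points at the
// first array element and the segment's first entry is baseIndex.
func indexEntryPos(indexStart, baseIndex, idx uint64) uint64 {
	return indexStart + 4*(idx-baseIndex)
}

func main() {
	fmt.Println(indexEntryPos(1048576, 1000, 1234)) // 1049512
}
```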
# Crash Safety
Crash safety must be maintained through three types of write operation: appending
a batch of entries, truncating from the head (oldest) entries, and truncating
the newest entries.
## Appending Entries
We want to `fsync` only once for an append batch, however many entries were in
it. We assume [Powersafe Overwrites](#powersafe-overwrites-psow) or PSOW, a
weaker assumption than atomic sector writes in general. Thanks to PSOW, we
assume we can start appending at the tail of the file right after previously
committed entries even if the new entries will be written to the same sector as
the older entries, and that the system will never corrupt the already committed
part of the sector even if it is not atomic and arbitrarily garbles the part of
the sector we actually did write.
At the end of the batch we write a `Commit` frame containing the CRC over the
data written during the current batch.
In a crash one of the following states occurs:
1. All sectors modified across all frames make it to disk (crash _after_ fsync).
2. A torn write: one or more sectors, anywhere in the modified tail of the file
might not be persisted. We don't assume they are zero, they might be
arbitrarily garbled (crash _before_ fsync).
We can check which one of these is true with the recovery procedure outlined
below. If we find the last batch _was_ torn. It must not have been acknowledged
to Raft yet (since `fsync` can't have returned) and so it is safe to assume that
the previous commit frame is the tail of the log we've actually acknowledged.
### Recovery
We cover recovering the segments generally below since we have to account for
truncations. All segments except the tail were fsynced during seal before the
new tail was added to the meta DB, so we can assume they all made it to disk
if a later tail was added.
On startup we just need to recover the tail log as follows:
1. If the file doesn't exist, create it from Meta DB information. DONE.
2. Open the file and validate that the header matches the filename. If not, delete it and go to 1.
3. Read all records in the file in sequence, keeping track of the last two
commit frames observed.
1. If the file ends with a corrupt frame or non-commit frame, discard
anything after the last commit frame. We're DONE because we wouldn't have
written extra frames after commit until fsync completed so this commit
must have been acknowledged.
1. Else the file ends with a commit frame. Validate its checksum. If it is good DONE.
2. If CRC is not good then discard everything back to previous commit frame and DONE.
4. If we read an index frame in that process and the commit frame proceeding it
is the new tail then mark the segment as sealed and return the seal info
(crash occured after seal but before updating `wal-meta.db`)
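A sketch of that scan; `readFrame` here is a hypothetical helper that parses one frame at the given offset and, for commit frames, reports whether the CRC checks out:
```go
package example

import (
	"io"
	"os"
)

// frameInfo is what the hypothetical readFrame helper reports for one frame.
type frameInfo struct {
	isCommit bool
	crcOK    bool  // only meaningful for commit frames
	len      int64 // total encoded length of the frame on disk
}

// recoverTail walks the unsealed tail segment and truncates it back to the
// last commit frame we can trust, following the steps above. headerLen is the
// fixed file header size (an assumption for this sketch).
func recoverTail(f *os.File, headerLen int64,
	readFrame func(r io.ReaderAt, off int64) (frameInfo, error)) error {
	off := headerLen
	prevGood, lastGood := headerLen, headerLen // just past the last two trusted commits
	for {
		fr, err := readFrame(f, off)
		if err != nil {
			// Torn or corrupt data at the tail: keep everything up to the last
			// commit frame. Later bytes can't have been acknowledged to Raft.
			break
		}
		if fr.isCommit {
			if !fr.crcOK {
				// The final batch was torn mid-write; fall back to the batch
				// before it, which is the last one fsync acknowledged.
				lastGood = prevGood
				break
			}
			prevGood, lastGood = lastGood, off+fr.len
		}
		off += fr.len
	}
	return f.Truncate(lastGood)
}
```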
## Head Truncations
The most common form of truncation is a "head" truncation: removing the oldest
prefix of entries after a periodic snapshot has been made, to reclaim space.
To be crash safe we can't rely on atomically updating or deleting multiple
segment files. The process looks like this (a sketch in Go follows the list):
1. In one transaction on Meta DB:
1. Update the `meta.min_index` to be the new min.
2. Delete any segments from the `segments` bucket that are sealed and where
their highest index is less than the new min index.
3. Commit Txn. This is the commit point for crash recovery.
2. Update the in-memory segment state to match (if not already done with a lock
held).
3. Delete any segment files we just removed from the meta DB.
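A sketch of that ordering using bbolt. The bucket and key names are illustrative only; the vendored `metadb` package later in this commit actually stores all of its state as a single JSON blob under one key:
```go
package example

import (
	"encoding/binary"
	"os"
	"path/filepath"

	"go.etcd.io/bbolt"
)

// segMeta is a hypothetical in-memory view of one segment's metadata.
type segMeta struct {
	ID       uint64
	MaxIndex uint64 // highest index stored; valid once sealed
	Sealed   bool
	FileName string
}

// truncateHead commits the new minimum index and drops fully-truncated sealed
// segments in one bolt transaction (the crash-recovery commit point) and only
// then unlinks the files. A crash in between just leaves orphan files that the
// recovery walk deletes later.
func truncateHead(db *bbolt.DB, dir string, newMin uint64, segs []segMeta) error {
	var obsolete []string
	err := db.Update(func(tx *bbolt.Tx) error {
		meta := tx.Bucket([]byte("wal-meta")) // assumed to exist already
		var v [8]byte
		binary.BigEndian.PutUint64(v[:], newMin)
		if err := meta.Put([]byte("min_index"), v[:]); err != nil {
			return err
		}
		for _, s := range segs {
			if s.Sealed && s.MaxIndex < newMin {
				var k [8]byte
				binary.BigEndian.PutUint64(k[:], s.ID)
				if err := meta.Delete(k[:]); err != nil {
					return err
				}
				obsolete = append(obsolete, s.FileName)
			}
		}
		return nil // Update commits the transaction when we return nil.
	})
	if err != nil {
		return err
	}
	// Files are removed only after the meta DB commit.
	for _, name := range obsolete {
		if err := os.Remove(filepath.Join(dir, name)); err != nil && !os.IsNotExist(err) {
			return err
		}
	}
	return nil
}
```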
### Recovery
The metadata update is crash safe thanks to BoltDB being the source of truth (a sketch of the orphan cleanup follows this list).
1. Reload meta state from the Meta DB.
2. Walk the files in the dir. For each one:
1. Check if that file is present in the Meta DB. If not, mark it for deletion.
2. (Optionally) validate the file header, file size and final block trailer
to ensure the file appears to be well-formed and contains the expected
data.
3. Delete the obsolete segments marked for deletion (could be done in a background thread).
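A sketch of the orphan cleanup, parsing names with the same pattern the vendored `segment/filer.go` later in this commit uses (`%020d-%016x.wal`):
```go
package example

import (
	"fmt"
	"os"
	"path/filepath"
)

// deleteOrphans removes any segment file in dir whose ID is not present in the
// freshly reloaded meta DB state (knownIDs). Anything else is left alone.
func deleteOrphans(dir string, knownIDs map[uint64]bool) error {
	entries, err := os.ReadDir(dir)
	if err != nil {
		return err
	}
	for _, e := range entries {
		var baseIndex, id uint64
		if n, err := fmt.Sscanf(e.Name(), "%020d-%016x.wal", &baseIndex, &id); err != nil || n != 2 {
			continue // not a segment file
		}
		if !knownIDs[id] {
			if err := os.Remove(filepath.Join(dir, e.Name())); err != nil {
				return err
			}
		}
	}
	return nil
}
```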
## Tail Truncations
Raft occasionally needs to truncate entries from the tail of the log, i.e.
remove the _most recent_ N entries. This can occur when a follower has
replicated entries from an old leader that it was partitioned with, but later
discovers they conflict with entries committed by the new leader in a later
term. The bounds on how long a partitioned leader can continue to replicate to
a follower are generally pretty small (30 seconds or so), so it's unlikely that
the number of records to be truncated will ever be large compared to the size
of a segment file, but we have to account for needing to delete one or more
segment files from the tail, as well as truncate the newest entries out of the
new tail.
This follows roughly the same pattern as head truncation, although there is an
added complication. A naive implementation that used only the baseIndex as a
segment file name could in theory get into a tricky state where it's ambiguous
whether the tail segment is an old one that was logically truncated away but
that we crashed before actually unlinking, or a new replacement with committed
data in it. It's possible to solve this with complex transactional semantics but
we take the simpler approach of just assigning every segment a unique identifier
separate from its baseIndex. So truncating the tail follows the same
procedure as the head above: segments we remove from the Meta DB can be
unambiguously deleted on recovery because their IDs won't match, even if later
segments end up with the same baseIndex.
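A minimal illustration of that naming scheme, mirroring the pattern used by the vendored `segment/filer.go` later in this commit:
```go
package example

import "fmt"

// fileName mirrors segment/filer.go's pattern: the zero-padded BaseIndex plus
// the segment's unique ID in hex, e.g. "00000000000000001000-000000000000002a.wal".
func fileName(baseIndex, id uint64) string {
	return fmt.Sprintf("%020d-%016x.wal", baseIndex, id)
}
```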
Since these truncations are generally rare and disk space is generally not a
major bottleneck, we also choose not to try to re-use a segment file that was
previously written and sealed by truncating it in place. Instead we just
mark it as "sealed" in the Meta DB with a MaxIndex of the highest index
left after the truncation (which we check on reads) and start a new segment at
the next index.
## System Assumptions
There are no straight answers to any question about which guarantees can be
relied on across operating systems, file systems, raid controllers and
hardware devices. We state [our assumptions](#our-assumptions) followed by a
summary of the assumptions made by some other respectable sources for
comparison.
### Our Assumptions
We've tried to make the weakest assumptions we can while still keeping things
relatively simple and performant.
We assume:
1. That while silent latent errors are possible, they are generally rare and
there's not a whole lot we can do other than return a `Corrupt` error on
read. In most cases the hardware or filesystem will detect and return an
error on read anyway for latent corruption. Not doing so is regarded as a
bug in the OS/filesystem/hardware. For this reason we don't go out of our
way to checksum everything to protect against "bitrot". This is roughly
equivalent to assumptions in BoltDB, LMDB and SQLite.
While we respect the work in [Protocol Aware Recovery for Consensus-based
Storage](https://www.usenix.org/system/files/conference/fast18/fast18-alagappan.pdf)
we choose not to implement a WAL format that allows identifying the index
and term of "lost" records on read errors so they can be recovered from
peers. This is mostly for the pragmatic reason that the Raft library this is
designed to work with would need a major re-write to take advantage of that
anyway. The proposed format in that paper also seems to make stronger
assumptions about sector atomicity than we are comfortable with too.
2. That sector writes are _not_ atomic. (Equivalent to SQLite, weaker than
almost everything else.)
3. That writing a partial sector does _not_ corrupt any already-stored data in
that sector outside of the range being written
([PSOW](#powersafe-overwrites-psow)). (Equivalent to SQLite's defaults,
RocksDB and Etcd.)
4. That `fsync` as implemented in Go's standard library actually flushes all
written sectors of the file to persistent media.
5. That `fsync` on a parent dir is sufficient to ensure newly created files are
not lost after a crash (assuming the file itself was written and `fsync`ed
first).
6. That appending to files may not be atomic since the filesystem metadata
about the size of the file may not be updated atomically with the data.
Generally we pre-allocate files where possible without writing all zeros but
we do potentially extend them if the last batch doesn't fit into the
allocated space or the filesystem doesn't support pre-allocation. Either way
we don't rely on the filesystem's reported size and validate the tail is
coherent on recovery.
### Published Paper on Consensus Disk Recovery
In the paper on [Protocol Aware Recovery for Consensus-based
Storage](https://www.usenix.org/system/files/conference/fast18/fast18-alagappan.pdf)
the authors assume that corruptions of the log can happen due to either torn
writes (for multi-sector appends) or latent corruptions after commit. They
explain the need to detect which it was, because torn writes only lose
un-acknowledged records and so are safe to detect and truncate, while corruption
of previously committed records impacts the correctness of the protocol more
generally. Their whole paper seems to indicate that these post-commit
corruptions are a major problem that needs to be correctly handled (which may
well be true). On the flip side, their WAL format design writes a separate index
and log, and explicitly assumes that because the index entries are smaller than
a 512-byte sector, they are safe from corruption during a write.
The core assumptions here are:
1. Latent, silent corruption of committed data needs to be detected at
application layer with a checksum per record checked on every read.
2. Sector writes are atomic.
3. Sector writes have [powersafe overwrites](#powersafe-overwrites-psow).
### SQLite
The SQLite authors have a [detailed explanation of their system
assumptions](https://www.sqlite.org/atomiccommit.html) which impact correctness
of atomic database commits.
> SQLite assumes that the detection and/or correction of bit errors caused by cosmic rays, thermal noise, quantum fluctuations, device driver bugs, or other mechanisms, is the responsibility of the underlying hardware and operating system. SQLite does not add any redundancy to the database file for the purpose of detecting corruption or I/O errors. SQLite assumes that the data it reads is exactly the same data that it previously wrote.
This is very different from the position of the above paper's authors, whose
main point is predicated on how to recover from silent corruptions of the file
caused by hardware, firmware or filesystem errors on read.
Note that this is a pragmatic position rather than a naive one: the authors are
certainly aware that file-systems have bugs, that faulty raid controllers exist
and even that hardware anomalies like high-flying or poorly tracking disk heads
can happen but choose _not_ to protect against that _at all_. See their
[briefing for linux kernel
developers](https://sqlite.org/lpc2019/doc/trunk/briefing.md) for more details
on the uncertainty they understand exists around these areas.
> SQLite has traditionally assumed that a sector write is not atomic.
These statements are on a page with this disclaimer:
> The information in this article applies only when SQLite is operating in "rollback mode", or in other words when SQLite is not using a write-ahead log.
[WAL mode](https://sqlite.org/wal.html) docs are less explicit on assumptions
and how crash recovery is achieved, but we can infer some things from the [file
format](https://sqlite.org/fileformat2.html#walformat) and
[code](https://github.com/sqlite/sqlite/blob/master/src/wal.c).
> The WAL header is 32 bytes in size...
> Immediately following the wal-header are zero or more frames. Each frame consists of a 24-byte frame-header followed by a page-size bytes of page data.
So each dirty page is appended with a 24-byte header, making it _not_ sector
aligned even though pages must be a multiple of the sector size.
Commit frames are also appended in the same way (and fsync called if enabled as
an option). If fsync is enabled though (and POWERSAFE_OVERWRITE disabled),
SQLite will "pad" to the next sector boundary (or beyond) by repeating the last
frame until it's passed that boundary. For some reason, they take great care to
write up to the sector boundary, sync then write the rest. I assume this is just
to avoid waiting to flush the redundant padding bytes past the end of the sector
they care about. Padding prevents the next append from potentially overwriting
the committed frame's sector.
But...
> By default, SQLite assumes that an operating system call to write a range of bytes will not damage or alter any bytes outside of that range even if a power loss or OS crash occurs during that write. We call this the "powersafe overwrite" property. Prior to version 3.7.9 (2011-11-01), SQLite did not assume powersafe overwrite. But with the standard sector size increasing from 512 to 4096 bytes on most disk drives, it has become necessary to assume powersafe overwrite in order to maintain historical performance levels and so powersafe overwrite is assumed by default in recent versions of SQLite.
> [assuming no power safe overwrite] In WAL mode, each transaction had to be padded out to the next 4096-byte boundary in the WAL file, rather than the next 512-byte boundary, resulting in thousands of extra bytes being written per transaction.
> SQLite never assumes that database page writes are atomic, regardless of the PSOW setting.(1) And hence SQLite is always able to automatically recover from torn pages induced by a crash. Enabling PSOW does not decrease SQLite's ability to recover from a torn page.
So they basically changed to make SSDs performant and now assume _by default_
that appending to a partial sector won't damage other data. The authors are
explicit that ["powersafe overwrite"](#powersafe-overwrites-psow) is a separate
property from atomicity and they still don't rely on sector atomicity. But they
do now assume powersafe overwrites by default.
To summarize, SQLite authors assume:
1. Latent, silent corruptions of committed data should be caught by the file
system or hardware and so shouldn't need to be accounted for in application
code.
2. Sector writes are _not_ atomic, but...
3. Partial sector overwrites can't corrupt committed data in same sector (by
default).
### Etcd WAL
The authors of etcd's WAL, like the authors of the paper above, indicate
the need to distinguish between torn writes and silent corruptions.
They maintain a rolling checksum of all records which is used only on recovery,
which would imply they only care about torn writes, since per-record checksums
are not checked on subsequent reads from the file after recovery. But they have
specific code to distinguish between torn writes and "other" corruption during
recovery.
They are careful to pad every record with 0 to 7 bytes such that the length
prefix for the next record is always 8-byte aligned and so can't span more than
one sector.
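As a small illustration of that alignment rule (a sketch, not etcd's actual code):
```go
package example

// padLen returns how many zero bytes to append after a record of length n so
// that the next record's 8-byte length prefix starts on an 8-byte boundary.
func padLen(n int) int {
	return (8 - n%8) % 8
}
```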
But their method of detecting a torn write (rather than latent corruption)
relies on reading through every 512-byte-aligned slice of the set of records
whose checksum has failed to match and seeing if there are any entirely zero
sectors.
This seems problematic in a purely logical way regardless of disk behavior: if a
legitimate record contains more than 1KiB of zero bytes and happens to ever be
corrupted after writing, that record will be falsely detected as a torn write
because at least one sector will be entirely zero bytes. In practice this
doesn't matter much because corruptions caused by anything other than torn
writes are likely very rare, but it does make me wonder why they bother trying
to tell the difference.
The implied assumptions in their design are:
1. Latent, silent corruption needs to be detected on recovery, but not on every
read.
2. Sector writes are atomic.
3. Partial sector writes don't corrupt existing data.
4. Torn writes (caused by multi-sector appends) always leave sectors all-zero.
### LMDB
Symas' Lightning Memory-mapped Database or LMDB is another well-used and
respected DB file format (along with Go-native port BoltDB used by Consul,
etcd and others).
LMDB writes exclusively in whole 4KiB pages. LMDB has a copy-on-write design
which reuses free pages and commits transactions using a double-buffering
technique: writing the root alternately to the first and second pages of the
file. Individual pages do not have checksums and may be larger than the physical
sector size. Dirty pages are written out to new or un-used pages and then
`fsync`ed before the transaction commits so there is no reliance on atomic
sector writes for data pages (a crash might leave pages of a transaction
partially written but they are not linked into the tree root yet so are ignored
on recovery).
The transaction commits only after the double-buffered meta page is written
out. LMDB relies on the fact that the actual content of the meta page is small
enough to fit in a single sector to avoid "torn writes" on the meta page. (See
[the author's
comments](https://ayende.com/blog/162856/reviewing-lightning-memory-mapped-database-library-transactions-commits)
on this blog). Although sector writes are assumed to be atomic, there is no
reliance on partial sector writes due to the paged design.
The implied assumptions in this design are:
1. Latent, silent corruptions of committed data should be caught by the file
system or hardware and so shouldn't need to be accounted for in application
code.
2. Sector writes _are_ atomic.
3. No assumptions about Powersafe overwrite since all IO is in whole pages.
### BoltDB
BoltDB is a Go port of LMDB so inherits almost all of the same design
assumptions. One notable difference is that the author added a checksum to the
metadata page even though it still fits in a single sector. The author noted
in private correspondence that this was probably just a defensive measure
rather than a fix for a specific identified flaw in LMDB's design.
Initially this was _not_ used to revert to the alternate page on failure because
it was still assumed that meta fit in a single sector and that those writes were
atomic. But [a report of Docker causing corruption on a
crash](https://github.com/boltdb/bolt/issues/548) seemed to indicate that the
atomic sector writes assumption _was not safe_ alone and so the checksum was
used to detect non-atomic writes even on the less-than-a-sector meta page.
BoltDB is also an important baseline for our WAL since it has been the log
store in use for many years within Consul and other HashiCorp products.
The implied assumptions in this design are:
1. Latent, silent corruptions of committed data should be caught by the file
system or hardware and so shouldn't need to be accounted for in application
code.
2. Sector writes are _not_ atomic.
3. No assumptions about Powersafe overwrite since all IO is in whole pages.
### RocksDB WAL
RocksDB is another well-respected storage library based on Google's LevelDB.
RocksDB's [WAL
Format](https://github.com/facebook/rocksdb/wiki/Write-Ahead-Log-File-Format)
uses blocks to allow skipping through files and over corrupt records (which
seems dangerous to me in general but perhaps they assume only torn-write
corruptions are possible?).
Records are packed into 32KiB blocks until they don't fit. Records that are
larger use first/middle/last flags (which inspired this library) to consume
multiple blocks.
RocksDB WAL uses pre-allocated files but also re-uses old files on a circular
buffer pattern since they have tight control of how much WAL is needed. This
means they might be overwriting old records in place.
Each record independently gets a header with a checksum to detect corruption or
incomplete writes, but no attempt is made to avoid sector boundaries or partial
block writes - the current block is just appended to for each write.
Implied assumptions:
1. No Latent Corruptions? This isn't totally clear from the code or docs, but
the docs indicate that a record with a mismatching checksum can simply be
skipped over which would seem to violate basic durability properties for a
database if they were already committed. That would imply that checksums
only (correctly) detect torn writes with latent corruption not accounted
for.
2. Sector writes _are_ atomic.
3. Partial sector writes don't corrupt existing data.
### Are Sector Writes Atomic?
Russ Cox asked this on twitter and tweeted a link to an [excellent Stack
Overflow
answer](https://stackoverflow.com/questions/2009063/are-disk-sector-writes-atomic)
about this by one of the authors of the NVME spec.
> TLDR; if you are in tight control of your whole stack from application all the way down the the physical disks (so you can control and qualify the whole lot) you can arrange to have what you need to make use of disk atomicity. If you're not in that situation or you're talking about the general case, you should not depend on sector writes being atomic.
Despite this, _most_ current best-of-breed database libraries (notably except
SQLite and potentially BoltDB), [many linux file
systems](https://lkml.org/lkml/2009/8/24/156), and all academic papers on disk
failure modes I've found so far _do_ assume that sector writes are atomic.
I assume that the authors of these file systems, databases and papers are not
unaware of the complexities described in the above link or the possibility of
non-atomic sector writes, but rather have chosen to put those outside of the
reasonable recoverable behavior of their systems. The actual chances of
encountering a non-atomic sector write in a typical, modern system appear to be
small enough that these authors consider that a reasonable assumption even when
it's not a guarantee that can be 100% relied upon. (Although the Docker bug
linked above for [BoltDB](#boltdb) seems to indicate a real-world case of this
happening in a modern environment.)
### Powersafe Overwrites (PSOW)
A more subtle property, and a weaker assumption than full sector atomicity, is
what the [SQLite authors term "Powersafe
Overwrites"](https://www.sqlite.org/psow.html), abbreviated PSOW.
> By default, SQLite assumes that an operating system call to write a range of bytes will not damage or alter any bytes outside of that range even if a power loss or OS crash occurs during that write. We call this the "powersafe overwrite" property. Prior to version 3.7.9 (2011-11-01), SQLite did not assume powersafe overwrite. But with the standard sector size increasing from 512 to 4096 bytes on most disk drives, it has become necessary to assume powersafe overwrite in order to maintain historical performance levels and so powersafe overwrite is assumed by default in recent versions of SQLite.
Those who assume atomic sector writes _also_ assume this property, but the
reverse need not be true. SQLite's authors in the page above still assume
nothing about the atomicity of the actual data written to any sector, even when
POWERSAFE_OVERWRITE is enabled (which is now the default). They simply assume
that no _other_ data is harmed while performing a write that overlaps other
sectors, even if power fails.
It's our view that while there certainly can be cases where this assumption
doesn't hold, it's already weaker than the atomic sector write assumption that
most reliable storage software makes today, and so it is a safe assumption to
rely on for this case.
### Are fsyncs reliable?
Even when you explicitly `fsync` a file after writing to it, some devices or
even whole operating systems (e.g. macOS) _don't actually flush to disk_ to
improve performance.
In our case, we assume that Go's `os.File.Sync()` method makes the best
effort it can on all modern OSes. It does now at least behave correctly on macOS
(since Go 1.12). But we can't do anything about a lying hardware device.
# Future Extensions
* **Auto-tuning segment size.** This format allows segments to be different
sizes. We could start with a smaller segment size of, say, a single 1MiB block
and then measure how long it takes to fill each segment. If segments fill
faster than some target rate we could double the allocated size of the next
segment. This could mean a burst of writes makes the segments grow; if the
writes then slow down, the log would take a long time to free disk space
because the larger segments take so long to fill. Arguably not a terrible
problem, but we could also have it auto-tune the segment size back down when
the write rate drops. The only major benefit here would be to allow trivial
usages like tests to not need a whole 64MiB of disk space just to record a
handful of log entries. But those could also just manually configure a smaller
segment size.
# References
In no particular order.
**Files and Crash Recovery**
* [Files are hard](https://danluu.com/file-consistency/)
* [Files are fraught with peril](https://danluu.com/deconstruct-files/)
* [Ensuring data reaches disk](https://lwn.net/Articles/457667/)
* [Write Atomicity and NVME Device Design](https://www.bswd.com/FMS12/FMS12-Rudoff.pdf)
* [Durability: NVME Disks](https://www.evanjones.ca/durability-nvme.html)
* [Intel SSD Durability](https://www.evanjones.ca/intel-ssd-durability.html)
* [Are Disk Sector Writes Atomic?](https://stackoverflow.com/questions/2009063/are-disk-sector-writes-atomic/61832882#61832882)
* [Protocol Aware Recovery for Consensus-based Storage](https://www.usenix.org/system/files/conference/fast18/fast18-alagappan.pdf)
* [Atomic Commit in SQLite](https://www.sqlite.org/atomiccommit.html)
* ["Powersafe Overwrites" in SQLite](https://www.sqlite.org/psow.html)
* [An Analysis of Data Corruption in the Storage Stack](https://www.cs.toronto.edu/~bianca/papers/fast08.pdf)
**DB Design and Storage File layout**
* [BoltDB Implementation](https://github.com/boltdb/bolt)
* LMDB Design: [slides](https://www.snia.org/sites/default/files/SDC15_presentations/database/HowardChu_The_Lighting_Memory_Database.pdf), [talk](https://www.youtube.com/watch?v=tEa5sAh-kVk)
* [SQLite file layout](https://www.sqlite.org/fileformat.html)
**WAL implementations**
* [SQLite WAL Mode](https://sqlite.org/wal.html)
* [RocksDB WAL Format](https://github.com/facebook/rocksdb/wiki/Write-Ahead-Log-File-Format)
* [etcd implementation](https://github.com/etcd-io/etcd/tree/master/wal)

vendor/github.com/hashicorp/raft-wal/codec.go generated vendored Normal file

@ -0,0 +1,167 @@
// Copyright (c) HashiCorp, Inc
// SPDX-License-Identifier: MPL-2.0
package wal
import (
"encoding/binary"
"io"
"time"
"github.com/hashicorp/raft"
)
const (
// FirstExternalCodecID is the lowest value an external code may use to
// identify their codec. Values lower than this are reserved for future
// internal use.
FirstExternalCodecID = 1 << 16
// Codec* constants identify internally-defined codec identifiers.
CodecBinaryV1 uint64 = iota
)
// Codec is the interface required for encoding/decoding log entries. Callers
// can pass a custom one to manage their own serialization, or to add additional
// layers like encryption or compression of records. Each codec
type Codec interface {
// ID returns the globally unique identifier for this codec version. This is
// encoded into segment file headers and must remain consistent over the life
// of the log. Values up to FirstExternalCodecID are reserved and will error
// if specified externally.
ID() uint64
// Encode the log into the io.Writer. We pass a writer to allow the caller to
// manage buffer allocation and re-use.
Encode(l *raft.Log, w io.Writer) error
// Decode a log from the passed byte slice into the log entry pointed to. This
// allows the caller to manage allocation and re-use of the bytes and log
// entry. The resulting raft.Log MUST NOT reference data in the input byte
// slice since the input byte slice may be returned to a pool and re-used.
Decode([]byte, *raft.Log) error
}
// BinaryCodec is a Codec that encodes raft.Log with a simple binary format. We
// test that all fields are captured using reflection.
//
// For now we assume raft.Log is not likely to change too much. If it does we'll
// use a new Codec ID for the later version and have to support decoding either.
type BinaryCodec struct{}
// ID returns the globally unique identifier for this codec version. This is
// encoded into segment file headers and must remain consistent over the life
// of the log. Values up to FirstExternalCodecID are reserved and will error
// if specified externally.
func (c *BinaryCodec) ID() uint64 {
return CodecBinaryV1
}
// Encode the log into the io.Writer. We pass a writer to allow the caller to
// manage buffer allocation and re-use.
func (c *BinaryCodec) Encode(l *raft.Log, w io.Writer) error {
enc := encoder{w: w}
enc.varint(l.Index)
enc.varint(l.Term)
enc.varint(uint64(l.Type))
enc.bytes(l.Data)
enc.bytes(l.Extensions)
enc.time(l.AppendedAt)
return enc.err
}
// Decode a log from the passed byte slice into the log entry pointed to. This
// allows the caller to manage allocation and re-use of the bytes and log
// entry.
func (c *BinaryCodec) Decode(bs []byte, l *raft.Log) error {
dec := decoder{buf: bs}
l.Index = dec.varint()
l.Term = dec.varint()
l.Type = raft.LogType(dec.varint())
l.Data = dec.bytes()
l.Extensions = dec.bytes()
l.AppendedAt = dec.time()
return dec.err
}
type encoder struct {
w io.Writer
err error
scratch [10]byte
}
func (e *encoder) varint(v uint64) {
if e.err != nil {
return
}
// Varint encoding might use up to 9 bytes for a uint64
n := binary.PutUvarint(e.scratch[:], v)
_, e.err = e.w.Write(e.scratch[:n])
}
func (e *encoder) bytes(bs []byte) {
// Put a length prefix
e.varint(uint64(len(bs)))
if e.err != nil {
return
}
// Copy the bytes to the writer
_, e.err = e.w.Write(bs)
}
func (e *encoder) time(t time.Time) {
if e.err != nil {
return
}
bs, err := t.MarshalBinary()
if err != nil {
e.err = err
return
}
_, e.err = e.w.Write(bs)
}
type decoder struct {
buf []byte
err error
}
func (d *decoder) varint() uint64 {
if d.err != nil {
return 0
}
v, n := binary.Uvarint(d.buf)
d.buf = d.buf[n:]
return v
}
func (d *decoder) bytes() []byte {
// Get length prefix
n := d.varint()
if d.err != nil {
return nil
}
if n == 0 {
return nil
}
if n > uint64(len(d.buf)) {
d.err = io.ErrShortBuffer
return nil
}
bs := make([]byte, n)
copy(bs, d.buf[:n])
d.buf = d.buf[n:]
return bs
}
func (d *decoder) time() time.Time {
var t time.Time
if d.err != nil {
return t
}
// Note that Unmarshal Binary updates d.buf to remove the bytes it read
// already.
d.err = t.UnmarshalBinary(d.buf)
return t
}

vendor/github.com/hashicorp/raft-wal/fs/file.go generated vendored Normal file

@ -0,0 +1,40 @@
// Copyright (c) HashiCorp, Inc
// SPDX-License-Identifier: MPL-2.0
package fs
import (
"os"
"sync/atomic"
"github.com/hashicorp/raft-wal/types"
)
var _ types.WritableFile = &File{}
// File wraps an os.File and implements types.WritableFile. It ensures that the
// first time Sync is called on the file, that the parent directory is also
// Fsynced to ensure a crash won't cause the FS to forget the file is there.
//
// Postponing this allows us to ensure that we do the minimum necessary fsyncs
// but still ensure all required fsyncs are done by the time we acknowledge
// committed data in the new file.
type File struct {
new uint32 // atomically accessed, keep it aligned!
dir string
os.File
}
// Sync calls fsync on the underlying file. If this is the first call to Sync
// since creation it also fsyncs the parent dir.
func (f *File) Sync() error {
// Sync the underlying file
if err := f.File.Sync(); err != nil {
return err
}
new := atomic.SwapUint32(&f.new, 1)
if new == 0 {
return syncDir(f.dir)
}
return nil
}

vendor/github.com/hashicorp/raft-wal/fs/fs.go generated vendored Normal file

@ -0,0 +1,128 @@
// Copyright (c) HashiCorp, Inc
// SPDX-License-Identifier: MPL-2.0
package fs
import (
"fmt"
"io/ioutil"
"math"
"os"
"path/filepath"
"github.com/coreos/etcd/pkg/fileutil"
"github.com/hashicorp/raft-wal/types"
)
// FS implements the wal.VFS interface using GO's built in OS Filesystem (and a
// few helpers).
//
// TODO if we changed the interface to be Dir centric we could cache the open
// dir handle and save some time opening it on each Create in order to fsync.
type FS struct {
}
func New() *FS {
return &FS{}
}
// ListDir returns a list of all files in the specified dir in lexicographical
// order. If the dir doesn't exist, it must return an error. Empty array with
// nil error is assumed to mean that the directory exists and was readable,
// but contains no files.
func (fs *FS) ListDir(dir string) ([]string, error) {
files, err := ioutil.ReadDir(dir)
if err != nil {
return nil, err
}
names := make([]string, len(files))
for i, f := range files {
if f.IsDir() {
continue
}
names[i] = f.Name()
}
return names, nil
}
// Create creates a new file with the given name. If a file with the same name
// already exists an error is returned. If a non-zero size is given,
// implementations should make a best effort to pre-allocate the file to be
// that size. The dir must already exist and be writable to the current
// process.
func (fs *FS) Create(dir string, name string, size uint64) (types.WritableFile, error) {
f, err := os.OpenFile(filepath.Join(dir, name), os.O_CREATE|os.O_EXCL|os.O_RDWR, os.FileMode(0644))
if err != nil {
return nil, err
}
// We just created the file. Preallocate its size.
if size > 0 {
if size > math.MaxInt32 {
return nil, fmt.Errorf("maximum file size is %d bytes", math.MaxInt32)
}
if err := fileutil.Preallocate(f, int64(size), true); err != nil {
f.Close()
return nil, err
}
}
// We don't fsync here for performance reasons. Technically we need to fsync
// the file itself to make sure it is really persisted to disk, and you always
// need to fsync its parent dir after a creation because fsync doesn't ensure
// the directory entry is persisted - a crash could make the file appear to be
// missing as there is no directory entry.
//
// BUT, it doesn't actually matter if this file is crash safe, right up to the
// point where we actually commit log data. Since we always fsync the file
// when we commit logs, we don't need to again here. That does however leave
// the parent dir fsync which must be done after the first fsync to a newly
// created file to ensure it survives a crash.
//
// To handle that, we return a wrapped io.File that will fsync the parent dir
// as well, the first time Sync is called (and only the first time).
fi := &File{
new: 0,
dir: dir,
File: *f,
}
return fi, nil
}
// Delete indicates the file is no longer required. Typically it should be
// deleted from the underlying system to free disk space.
func (fs *FS) Delete(dir string, name string) error {
if err := os.Remove(filepath.Join(dir, name)); err != nil {
return err
}
// Make sure parent directory metadata is fsynced too before we call this
// "done".
return syncDir(dir)
}
// OpenReader opens an existing file in read-only mode. If the file doesn't
// exist or permission is denied, an error is returned, otherwise no checks
// are made about the well-formedness of the file, it may be empty, the wrong
// size or corrupt in arbitrary ways.
func (fs *FS) OpenReader(dir string, name string) (types.ReadableFile, error) {
return os.OpenFile(filepath.Join(dir, name), os.O_RDONLY, os.FileMode(0644))
}
// OpenWriter opens a file in read-write mode. If the file doesn't exist or
// permission is denied, an error is returned, otherwise no checks are made
// about the well-formedness of the file, it may be empty, the wrong size or
// corrupt in arbitrary ways.
func (fs *FS) OpenWriter(dir string, name string) (types.WritableFile, error) {
return os.OpenFile(filepath.Join(dir, name), os.O_RDWR, os.FileMode(0644))
}
func syncDir(dir string) error {
f, err := os.Open(dir)
if err != nil {
return err
}
err = f.Sync()
closeErr := f.Close()
if err != nil {
return err
}
return closeErr
}

vendor/github.com/hashicorp/raft-wal/metadb/metadb.go generated vendored Normal file

@ -0,0 +1,269 @@
// Copyright (c) HashiCorp, Inc
// SPDX-License-Identifier: MPL-2.0
package metadb
import (
"encoding/json"
"errors"
"fmt"
"os"
"path/filepath"
"github.com/hashicorp/raft-wal/types"
"go.etcd.io/bbolt"
)
const (
// FileName is the default file name for the bolt db file.
FileName = "wal-meta.db"
// *Bucket are the names used for internal bolt buckets
MetaBucket = "wal-meta"
StableBucket = "stable"
// We just need one key for now so use the byte 'm' for meta arbitrarily.
MetaKey = "m"
)
var (
// ErrUnintialized is returned when any call is made before Load has opened
// the DB file.
ErrUnintialized = errors.New("uninitialized")
)
// BoltMetaDB implements types.MetaStore using BoltDB as a reliable persistent
// store. See repo README for reasons for this design choice and performance
// implications.
type BoltMetaDB struct {
dir string
db *bbolt.DB
}
func (db *BoltMetaDB) ensureOpen(dir string) error {
if db.dir != "" && db.dir != dir {
return fmt.Errorf("can't load dir %s, already open in dir %s", dir, db.dir)
}
if db.db != nil {
return nil
}
fileName := filepath.Join(dir, FileName)
open := func() error {
bb, err := bbolt.Open(fileName, 0644, nil)
if err != nil {
return fmt.Errorf("failed to open %s: %w", FileName, err)
}
db.db = bb
db.dir = dir
return nil
}
// BoltDB can get stuck in invalid states if we crash while it's initializing.
// We can't distinguish those as safe to just wipe it and start again because
// we don't know for sure if it's failing due to bad init or later corruption
// (which would lose data if we just wipe and start over). So to ensure
// initial creation of the WAL is as crash-safe as possible we manually
// implement an atomic init procedure:
// 1. Check if the file exists already. If yes, skip init and just open it.
// 2. Delete any existing DB file with the tmp name
// 3. Create a new BoltDB that is empty and has the buckets, with a temp name.
// 4. Once that's committed, rename to the final name and fsync the parent dir
_, err := os.Stat(fileName)
if err == nil {
// File exists, just open it
return open()
}
if !errors.Is(err, os.ErrNotExist) {
// Unknown err just return that
return fmt.Errorf("failed to stat %s: %w", FileName, err)
}
// File doesn't exist, initialize a new DB in a crash-safe way
if err := safeInitBoltDB(dir); err != nil {
return fmt.Errorf("failed initializing meta DB: %w", err)
}
// All good, now open it!
return open()
}
func safeInitBoltDB(dir string) error {
tmpFileName := filepath.Join(dir, FileName+".tmp")
// Delete any old attempts to init that were unsuccessful
if err := os.RemoveAll(tmpFileName); err != nil {
return err
}
// Open bolt DB at tmp file name
bb, err := bbolt.Open(tmpFileName, 0644, nil)
if err != nil {
return err
}
tx, err := bb.Begin(true)
defer tx.Rollback()
if err != nil {
return err
}
_, err = tx.CreateBucket([]byte(MetaBucket))
if err != nil {
return err
}
_, err = tx.CreateBucket([]byte(StableBucket))
if err != nil {
return err
}
if err := tx.Commit(); err != nil {
return err
}
// Close the file ready to rename into place and re-open. This probably isn't
// necessary but it makes it easier to reason about this code path being
// totally separate from the common case.
if err := bb.Close(); err != nil {
return err
}
// We created the DB OK. Now rename it to the final name.
if err := os.Rename(tmpFileName, filepath.Join(dir, FileName)); err != nil {
return err
}
// And fsync the parent dir to make sure the new file with its new name
// is persisted!
dirF, err := os.Open(dir)
if err != nil {
return err
}
err = dirF.Sync()
closeErr := dirF.Close()
if err != nil {
return err
}
return closeErr
}
// Load loads the existing persisted state. If there is no existing state
// implementations are expected to create initialize new storage and return an
// empty state.
func (db *BoltMetaDB) Load(dir string) (types.PersistentState, error) {
var state types.PersistentState
if err := db.ensureOpen(dir); err != nil {
return state, err
}
tx, err := db.db.Begin(false)
if err != nil {
return state, err
}
defer tx.Rollback()
meta := tx.Bucket([]byte(MetaBucket))
// We just need one key for now so use the byte 'm' for meta arbitrarily.
raw := meta.Get([]byte(MetaKey))
if raw == nil {
// This is valid it's an "empty" log that will be initialized by the WAL.
return state, nil
}
if err := json.Unmarshal(raw, &state); err != nil {
return state, fmt.Errorf("%w: failed to parse persisted state: %s", types.ErrCorrupt, err)
}
return state, nil
}
// CommitState must atomically replace all persisted metadata in the current
// store with the set provided. It must not return until the data is persisted
// durably and in a crash-safe way otherwise the guarantees of the WAL will be
// compromised. The WAL will only ever call this in a single thread at one
// time and it will never be called concurrently with Load however it may be
// called concurrently with Get/SetStable operations.
func (db *BoltMetaDB) CommitState(state types.PersistentState) error {
if db.db == nil {
return ErrUnintialized
}
encoded, err := json.Marshal(state)
if err != nil {
return fmt.Errorf("failed to encode persisted state: %w", err)
}
tx, err := db.db.Begin(true)
if err != nil {
return err
}
defer tx.Rollback()
meta := tx.Bucket([]byte(MetaBucket))
if err := meta.Put([]byte(MetaKey), encoded); err != nil {
return err
}
return tx.Commit()
}
// GetStable returns a value from stable store or nil if it doesn't exist. May
// be called concurrently by multiple threads.
func (db *BoltMetaDB) GetStable(key []byte) ([]byte, error) {
if db.db == nil {
return nil, ErrUnintialized
}
tx, err := db.db.Begin(false)
if err != nil {
return nil, err
}
defer tx.Rollback()
stable := tx.Bucket([]byte(StableBucket))
val := stable.Get(key)
if val == nil {
return nil, nil
}
// Need to copy the value since bolt only guarantees the slice is valid until
// end of txn.
ret := make([]byte, len(val))
copy(ret, val)
return ret, nil
}
// SetStable stores a value from stable store. May be called concurrently with
// GetStable.
func (db *BoltMetaDB) SetStable(key []byte, value []byte) error {
if db.db == nil {
return ErrUnintialized
}
tx, err := db.db.Begin(true)
if err != nil {
return err
}
defer tx.Rollback()
stable := tx.Bucket([]byte(StableBucket))
if value == nil {
err = stable.Delete(key)
} else {
err = stable.Put(key, value)
}
if err != nil {
return err
}
return tx.Commit()
}
// Close implements io.Closer
func (db *BoltMetaDB) Close() error {
if db.db == nil {
return nil
}
err := db.db.Close()
db.db = nil
return err
}

vendor/github.com/hashicorp/raft-wal/metrics.go generated vendored Normal file

@ -0,0 +1,78 @@
// Copyright (c) HashiCorp, Inc
// SPDX-License-Identifier: MPL-2.0
package wal
import (
"github.com/hashicorp/raft-wal/metrics"
)
var (
// MetricDefinitions describe the metrics emitted by this library via the
// provided metrics.Collector implementation. It's public so that these can be
// registered during init with metrics clients that support pre-defining
// metrics.
MetricDefinitions = metrics.Definitions{
Counters: []metrics.Descriptor{
{
Name: "log_entry_bytes_written",
Desc: "log_entry_bytes_written counts the bytes of log entry after encoding" +
" with Codec. Actual bytes written to disk might be slightly higher as it" +
" includes headers and index entries.",
},
{
Name: "log_entries_written",
Desc: "log_entries_written counts the number of entries written.",
},
{
Name: "log_appends",
Desc: "log_appends counts the number of calls to StoreLog(s) i.e." +
" number of batches of entries appended.",
},
{
Name: "log_entry_bytes_read",
Desc: "log_entry_bytes_read counts the bytes of log entry read from" +
" segments before decoding. actual bytes read from disk might be higher" +
" as it includes headers and index entries and possible secondary reads" +
" for large entries that don't fit in buffers.",
},
{
Name: "log_entries_read",
Desc: "log_entries_read counts the number of calls to get_log.",
},
{
Name: "segment_rotations",
Desc: "segment_rotations counts how many times we move to a new segment file.",
},
{
Name: "head_truncations",
Desc: "head_truncations counts how many log entries have been truncated" +
" from the head - i.e. the oldest entries. by graphing the rate of" +
" change over time you can see individual truncate calls as spikes.",
},
{
Name: "tail_truncations",
Desc: "tail_truncations counts how many log entries have been truncated" +
" from the tail - i.e. the newest entries. by graphing the rate of" +
" change over time you can see individual truncate calls as spikes.",
},
{
Name: "stable_gets",
Desc: "stable_gets counts how many calls to StableStore.Get or GetUint64.",
},
{
Name: "stable_sets",
Desc: "stable_sets counts how many calls to StableStore.Set or SetUint64.",
},
},
Gauges: []metrics.Descriptor{
{
Name: "last_segment_age_seconds",
Desc: "last_segment_age_seconds is a gauge that is set each time we" +
" rotate a segment and describes the number of seconds between when" +
" that segment file was first created and when it was sealed. this" +
" gives a rough estimate how quickly writes are filling the disk.",
},
},
}
)


@ -0,0 +1,89 @@
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0
package metrics
import "sync/atomic"
var (
_ Collector = &AtomicCollector{}
)
// AtomicCollector is a simple Collector that atomically stores
// counters and gauges in memory.
type AtomicCollector struct {
counters []uint64
gauges []uint64
counterIndex, gaugeIndex map[string]int
}
// NewAtomicCollector creates a collector for the given set of Definitions.
func NewAtomicCollector(defs Definitions) *AtomicCollector {
c := &AtomicCollector{
counters: make([]uint64, len(defs.Counters)),
gauges: make([]uint64, len(defs.Gauges)),
counterIndex: make(map[string]int),
gaugeIndex: make(map[string]int),
}
for i, d := range defs.Counters {
if _, ok := c.counterIndex[d.Name]; ok {
panic("duplicate metrics named " + d.Name)
}
c.counterIndex[d.Name] = i
}
for i, d := range defs.Gauges {
if _, ok := c.counterIndex[d.Name]; ok {
panic("duplicate metrics named " + d.Name)
}
if _, ok := c.gaugeIndex[d.Name]; ok {
panic("duplicate metrics named " + d.Name)
}
c.gaugeIndex[d.Name] = i
}
return c
}
// IncrementCounter record val occurrences of the named event. Names will
// follow prometheus conventions with lower_case_and_underscores. We don't
// need any additional labels currently.
func (c *AtomicCollector) IncrementCounter(name string, delta uint64) {
id, ok := c.counterIndex[name]
if !ok {
panic("invalid metric name: " + name)
}
atomic.AddUint64(&c.counters[id], delta)
}
// SetGauge sets the value of the named gauge overriding any previous value.
func (c *AtomicCollector) SetGauge(name string, val uint64) {
id, ok := c.gaugeIndex[name]
if !ok {
panic("invalid metric name: " + name)
}
atomic.StoreUint64(&c.gauges[id], val)
}
// Summary returns a summary of the metrics since startup. Each value is
// atomically loaded but the set is not atomic overall and may represent an
// inconsistent snapshot e.g. with some metrics reflecting the most recent
// operation while others don't.
func (c *AtomicCollector) Summary() Summary {
s := Summary{
Counters: make(map[string]uint64, len(c.counters)),
Gauges: make(map[string]uint64, len(c.gauges)),
}
for name, id := range c.counterIndex {
s.Counters[name] = atomic.LoadUint64(&c.counters[id])
}
for name, id := range c.gaugeIndex {
s.Gauges[name] = atomic.LoadUint64(&c.gauges[id])
}
return s
}
// Summary is a copy of the values recorded so far for each metric.
type Summary struct {
Counters map[string]uint64
Gauges map[string]uint64
}


@ -0,0 +1,85 @@
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0
// # Metrics Configuration
//
// The raft-wal library is instrumented to be able to use different metrics collectors. There are currently two implemented within this package:
// - atomic
// - go-metrics
//
// # go-metrics Compatibility
//
// This library can emit metrics using either github.com/armon/go-metrics or github.com/hashicorp/go-metrics. Choosing between the libraries is controlled via build tags.
//
// Build Tags:
// - armonmetrics - Using this tag will cause metrics to be routed to armon/go-metrics
// - hashicorpmetrics - Using this tag will cause all metrics to be routed to hashicorp/go-metrics
//
// If no build tag is specified, the default behavior is to use armon/go-metrics.
//
// # Deprecating armon/go-metrics
//
// Emitting metrics to armon/go-metrics is officially deprecated. Usage of armon/go-metrics will remain the default until mid-2025 with opt-in support continuing to the end of 2025.
//
// Migration:
// To migrate an application currently using the older armon/go-metrics to instead use hashicorp/go-metrics the following should be done.
//
// 1. Upgrade libraries using armon/go-metrics to consume hashicorp/go-metrics/compat instead. This should involve only changing import statements. All repositories within the hashicorp GitHub organization will be getting these updates in early 2025.
//
// 2. Update an application's library dependencies to those that have the compatibility layer configured.
//
// 3. Update the application to use hashicorp/go-metrics for configuring metrics export instead of armon/go-metrics
//
// - Replace all application imports of github.com/armon/go-metrics with github.com/hashicorp/go-metrics
//
// - Instrument your build system to build with the hashicorpmetrics tag.
//
// Eventually once the default behavior changes to use hashicorp/go-metrics by default (mid-2025), you can drop the hashicorpmetrics build tag.
package metrics
import gometrics "github.com/hashicorp/go-metrics/compat"
// GoMetricsCollector implements a Collector that passes through observations to
// a go-metrics instance. The zero value works, writing metrics to the default
// global instance however to set a prefix or a static set of labels to add to
// each metric observed, or to use a non-global metrics instance use
// NewGoMetricsCollector.
type GoMetricsCollector struct {
gm *gometrics.Metrics
prefix []string
labels []gometrics.Label
}
// NewGoMetricsCollector returns a GoMetricsCollector that will attach the
// specified name prefix and/or labels to each observation. If gm is nil the
// global metrics instance is used.
func NewGoMetricsCollector(prefix []string, labels []gometrics.Label, gm *gometrics.Metrics) *GoMetricsCollector {
if gm == nil {
gm = gometrics.Default()
}
return &GoMetricsCollector{
gm: gm,
prefix: prefix,
labels: labels,
}
}
// IncrementCounter record val occurrences of the named event. Names will
// follow prometheus conventions with lower_case_and_underscores. We don't
// need any additional labels currently.
func (c *GoMetricsCollector) IncrementCounter(name string, delta uint64) {
c.gm.IncrCounterWithLabels(c.name(name), float32(delta), c.labels)
}
// SetGauge sets the value of the named gauge overriding any previous value.
func (c *GoMetricsCollector) SetGauge(name string, val uint64) {
c.gm.SetGaugeWithLabels(c.name(name), float32(val), c.labels)
}
// name returns the metric name as a slice. We don't want to risk modifying the
// prefix slice's backing array since this might be called concurrently, so we
// always allocate a new slice.
func (c *GoMetricsCollector) name(name string) []string {
var ss []string
return append(append(ss, c.prefix...), name)
}


@ -0,0 +1,42 @@
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0
package metrics
// Collector provides a simple abstraction for counter type metrics that
// the WAL and log verifier can use without depending on a specific metrics
// collector implementation.
type Collector interface {
// IncrementCounter record val occurrences of the named event. Names will
// follow prometheus conventions with lower_case_and_underscores. We don't
// need any additional labels currently.
IncrementCounter(name string, delta uint64)
// SetGauge sets the value of the named gauge overriding any previous value.
SetGauge(name string, val uint64)
}
// Definitions provides a simple description of a set of scalar metrics.
type Definitions struct {
Counters []Descriptor
Gauges []Descriptor
}
// Descriptor describes a specific metric.
type Descriptor struct {
Name string
Desc string
}
var _ Collector = &NoOpCollector{}
// NoOpCollector is a Collector that does nothing.
type NoOpCollector struct{}
// IncrementCounter record val occurrences of the named event. Names will
// follow prometheus conventions with lower_case_and_underscores. We don't
// need any additional labels currently.
func (c *NoOpCollector) IncrementCounter(name string, delta uint64) {}
// SetGauge sets the value of the named gauge overriding any previous value.
func (c *NoOpCollector) SetGauge(name string, val uint64) {}

vendor/github.com/hashicorp/raft-wal/options.go generated vendored Normal file

@ -0,0 +1,92 @@
// Copyright (c) HashiCorp, Inc
// SPDX-License-Identifier: MPL-2.0
package wal
import (
"fmt"
"github.com/hashicorp/go-hclog"
"github.com/hashicorp/raft-wal/fs"
"github.com/hashicorp/raft-wal/metadb"
"github.com/hashicorp/raft-wal/metrics"
"github.com/hashicorp/raft-wal/segment"
"github.com/hashicorp/raft-wal/types"
)
// WithCodec is an option that allows a custom Codec to be provided to the WAL.
// If not used the default Codec is used.
func WithCodec(c Codec) walOpt {
return func(w *WAL) {
w.codec = c
}
}
// WithMetaStore is an option that allows a custom MetaStore to be provided to
// the WAL. If not used the default MetaStore is used.
func WithMetaStore(db types.MetaStore) walOpt {
return func(w *WAL) {
w.metaDB = db
}
}
// WithSegmentFiler is an option that allows a custom SegmentFiler (and hence
// Segment Reader/Writer implementation) to be provided to the WAL. If not used
// the default SegmentFiler is used.
func WithSegmentFiler(sf types.SegmentFiler) walOpt {
return func(w *WAL) {
w.sf = sf
}
}
// WithLogger is an option that allows a custom logger to be used.
func WithLogger(logger hclog.Logger) walOpt {
return func(w *WAL) {
w.log = logger
}
}
// WithSegmentSize is an option that allows a custom segmentSize to be set.
func WithSegmentSize(size int) walOpt {
return func(w *WAL) {
w.segmentSize = size
}
}
// WithMetricsCollector is an option that allows a custom metrics Collector to be set.
func WithMetricsCollector(c metrics.Collector) walOpt {
return func(w *WAL) {
w.metrics = c
}
}
func (w *WAL) applyDefaultsAndValidate() error {
// Check if an external codec has been used that it's not using a reserved ID.
if w.codec != nil && w.codec.ID() < FirstExternalCodecID {
return fmt.Errorf("codec is using a reserved ID (below %d)", FirstExternalCodecID)
}
// Defaults
if w.log == nil {
w.log = hclog.Default().Named("wal")
}
if w.codec == nil {
w.codec = &BinaryCodec{}
}
if w.sf == nil {
// These are not actually swappable via options right now but we override
// them in tests. Only load the default implementations if they are not set.
vfs := fs.New()
w.sf = segment.NewFiler(w.dir, vfs)
}
if w.metrics == nil {
w.metrics = &metrics.NoOpCollector{}
}
if w.metaDB == nil {
w.metaDB = &metadb.BoltMetaDB{}
}
if w.segmentSize == 0 {
w.segmentSize = DefaultSegmentSize
}
return nil
}

vendor/github.com/hashicorp/raft-wal/segment/crc.go generated vendored Normal file

@ -0,0 +1,14 @@
// Copyright (c) HashiCorp, Inc
// SPDX-License-Identifier: MPL-2.0
package segment
import (
"hash/crc32"
)
var castagnoliTable *crc32.Table
func init() {
castagnoliTable = crc32.MakeTable(crc32.Castagnoli)
}

vendor/github.com/hashicorp/raft-wal/segment/filer.go generated vendored Normal file

@ -0,0 +1,295 @@
// Copyright (c) HashiCorp, Inc
// SPDX-License-Identifier: MPL-2.0
package segment
import (
"errors"
"fmt"
"io"
"strings"
"sync"
"github.com/hashicorp/raft-wal/types"
)
const (
segmentFileSuffix = ".wal"
segmentFileNamePattern = "%020d-%016x" + segmentFileSuffix
)
// Filer implements the abstraction for managing a set of segment files in a
// directory. It uses a VFS to abstract actual file system operations for easier
// testing.
type Filer struct {
dir string
vfs types.VFS
bufPool sync.Pool
}
// NewFiler creates a Filer ready for use.
func NewFiler(dir string, vfs types.VFS) *Filer {
f := &Filer{
dir: dir,
vfs: vfs,
}
f.bufPool.New = func() interface{} {
return make([]byte, minBufSize)
}
return f
}
// FileName returns the formatted file name expected for this segment.
// SegmentFiler implementations could choose to ignore this but it's here to
func FileName(i types.SegmentInfo) string {
return fmt.Sprintf(segmentFileNamePattern, i.BaseIndex, i.ID)
}
// Create adds a new segment with the given info and returns a writer or an
// error.
func (f *Filer) Create(info types.SegmentInfo) (types.SegmentWriter, error) {
if info.BaseIndex == 0 {
return nil, fmt.Errorf("BaseIndex must be greater than zero")
}
fname := FileName(info)
wf, err := f.vfs.Create(f.dir, fname, uint64(info.SizeLimit))
if err != nil {
return nil, err
}
return createFile(info, wf, &f.bufPool)
}
// RecoverTail is called on an unsealed segment when re-opening the WAL it will
// attempt to recover from a possible crash. It will either return an error, or
// return a valid segmentWriter that is ready for further appends. If the
// expected tail segment doesn't exist it must return an error wrapping
// os.ErrNotExist.
func (f *Filer) RecoverTail(info types.SegmentInfo) (types.SegmentWriter, error) {
fname := FileName(info)
wf, err := f.vfs.OpenWriter(f.dir, fname)
if err != nil {
return nil, err
}
return recoverFile(info, wf, &f.bufPool)
}
// Open an already sealed segment for reading. Open may validate the file's
// header and return an error if it doesn't match the expected info.
func (f *Filer) Open(info types.SegmentInfo) (types.SegmentReader, error) {
fname := FileName(info)
rf, err := f.vfs.OpenReader(f.dir, fname)
if err != nil {
return nil, err
}
// Validate header here since openReader is re-used by writer where it's valid
// for the file header not to be committed yet after a crash so we can't check
// it there.
var hdr [fileHeaderLen]byte
if _, err := rf.ReadAt(hdr[:], 0); err != nil {
if errors.Is(err, io.EOF) {
// Treat failure to read a header as corruption since a sealed file should
// never not have a valid header. (I.e. even if crashes happen it should
// be impossible to seal a segment with no header written so this
// indicates that something truncated the file after the fact)
return nil, fmt.Errorf("%w: failed to read header: %s", types.ErrCorrupt, err)
}
return nil, err
}
gotInfo, err := readFileHeader(hdr[:])
if err != nil {
return nil, err
}
if err := validateFileHeader(*gotInfo, info); err != nil {
return nil, err
}
return openReader(info, rf, &f.bufPool)
}
// List returns the set of segment IDs currently stored. It's used by the WAL
// on recovery to find any segment files that need to be deleted following an
// unclean shutdown. The returned map is a map of ID -> BaseIndex. BaseIndex
// is returned to allow subsequent Delete calls to be made.
func (f *Filer) List() (map[uint64]uint64, error) {
segs, _, err := f.listInternal()
return segs, err
}
func (f *Filer) listInternal() (map[uint64]uint64, []uint64, error) {
files, err := f.vfs.ListDir(f.dir)
if err != nil {
return nil, nil, err
}
segs := make(map[uint64]uint64)
sorted := make([]uint64, 0)
for _, file := range files {
if !strings.HasSuffix(file, segmentFileSuffix) {
continue
}
// Parse BaseIndex and ID from the file name
var bIdx, id uint64
n, err := fmt.Sscanf(file, segmentFileNamePattern, &bIdx, &id)
if err != nil {
return nil, nil, types.ErrCorrupt
}
if n != 2 {
// Misnamed segment files with the right suffix indicate a bug or
// tampering; we can't be sure what's happened to the data.
return nil, nil, types.ErrCorrupt
}
segs[id] = bIdx
sorted = append(sorted, id)
}
return segs, sorted, nil
}
// Delete removes the segment with given baseIndex and id if it exists. Note
// that baseIndex is technically redundant since ID is unique on its own. But
// in practice we name files (or keys) with both so that they sort correctly.
// This interface allows a simpler implementation where we can just delete
// the file if it exists without having to scan the underlying storage for a
// match.
func (f *Filer) Delete(baseIndex uint64, ID uint64) error {
fname := fmt.Sprintf(segmentFileNamePattern, baseIndex, ID)
return f.vfs.Delete(f.dir, fname)
}
// DumpSegment attempts to read the segment file specified by the baseIndex and
// ID. Its intended purpose is for debugging the contents of segment files and,
// unlike the SegmentFiler interface, it doesn't assume the caller has access to
// the correct metadata. This allows dumping log segments in a WAL that is still
// being written to by another process. Without metadata we don't know if the
// file is sealed, so we always recover by reading through the whole file. If
// after or before are non-zero, they specify an exclusive lower or upper bound
// on which log entries should be emitted. No error checking is done on the read
// data. fn is called for each entry, passing the raft info read from the file
// header (so that the caller knows which codec to use, for example), the raft
// index of the entry, and the raw bytes of the entry itself. The callback must
// return true to continue reading. The data slice is only valid for the
// lifetime of the call.
func (f *Filer) DumpSegment(baseIndex uint64, ID uint64, after, before uint64, fn func(info types.SegmentInfo, e types.LogEntry) (bool, error)) error {
fname := fmt.Sprintf(segmentFileNamePattern, baseIndex, ID)
rf, err := f.vfs.OpenReader(f.dir, fname)
if err != nil {
return err
}
buf := make([]byte, 64*1024)
idx := baseIndex
type frameInfo struct {
Index uint64
Offset int64
Len uint32
}
var batch []frameInfo
_, err = readThroughSegment(rf, func(info types.SegmentInfo, fh frameHeader, offset int64) (bool, error) {
if fh.typ == FrameCommit {
// All the previous entries have been committed. Read them and send up to
// caller.
for _, frame := range batch {
// Check the header is reasonable
if frame.Len > MaxEntrySize {
return false, fmt.Errorf("failed to read entry idx=%d, frame header length (%d) is too big: %w",
frame.Index, frame.Len, err)
}
if frame.Len > uint32(len(buf)) {
buf = make([]byte, frame.Len)
}
n, err := rf.ReadAt(buf[:frame.Len], frame.Offset+frameHeaderLen)
if err != nil {
return false, err
}
if uint32(n) < frame.Len {
return false, io.ErrUnexpectedEOF
}
ok, err := fn(info, types.LogEntry{Index: frame.Index, Data: buf[:n]})
if !ok || err != nil {
return ok, err
}
}
// Reset batch
batch = batch[:0]
return true, nil
}
if fh.typ != FrameEntry {
return true, nil
}
if idx <= after {
// Not in the range we care about, skip reading the entry.
idx++
return true, nil
}
if before > 0 && idx >= before {
// We're done
return false, nil
}
batch = append(batch, frameInfo{idx, offset, fh.len})
idx++
return true, nil
})
return err
}
// DumpLogs attempts to read all log entries from segment files in the directory
// for debugging purposes. It does _not_ use the metadata and so may output log
// entries that are uncommitted or already truncated as far as the writing
// process is concerned. As such it should not be used for replication of data.
// It is useful though to debug the contents of the log even while the writing
// application is still running. After and before, if non-zero, specify
// exclusive bounds on the logs that should be returned, which may allow the
// implementation to skip reading entire segment files that are not in the
// range.
func (f *Filer) DumpLogs(after, before uint64, fn func(info types.SegmentInfo, e types.LogEntry) (bool, error)) error {
baseIndexes, segIDsSorted, err := f.listInternal()
if err != nil {
return err
}
for i, id := range segIDsSorted {
baseIndex := baseIndexes[id]
nextBaseIndex := uint64(0)
if i+1 < len(segIDsSorted) {
// This is not the last segment, peek at the base index of that one and
// assume that this segment won't contain indexes that high.
nextBaseIndex = baseIndexes[segIDsSorted[i+1]]
}
// See if this file contains any indexes in the range
if after > 0 && nextBaseIndex > 0 && after >= nextBaseIndex {
// This segment is all indexes before the lower bound we care about
continue
}
if before > 0 && before <= baseIndex {
// This segment is all indexes higher than the upper bound. We've output
// every log in the range at this point (barring edge cases where we race
// with a truncation which leaves multiple generations of segment files on
// disk which we are going to ignore for now).
return nil
}
// We probably care about at least some of the entries in this segment
err := f.DumpSegment(baseIndex, id, after, before, fn)
if err != nil {
return err
}
}
return nil
}
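To make the dump helpers concrete, here is a minimal, hedged sketch of a small debugging tool built on them; the WAL directory path is illustrative and the fs.New() on-disk VFS constructor is assumed from the library's fs package rather than taken from this file:

package main

import (
    "fmt"
    "log"

    "github.com/hashicorp/raft-wal/fs"
    "github.com/hashicorp/raft-wal/segment"
    "github.com/hashicorp/raft-wal/types"
)

func main() {
    // Walk every committed entry in every segment file under the directory.
    filer := segment.NewFiler("/var/lib/deevirt/wal", fs.New())
    err := filer.DumpLogs(0, 0, func(info types.SegmentInfo, e types.LogEntry) (bool, error) {
        fmt.Printf("segment=%d index=%d len=%d\n", info.ID, e.Index, len(e.Data))
        return true, nil // keep iterating
    })
    if err != nil {
        log.Fatal(err)
    }
}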

252
vendor/github.com/hashicorp/raft-wal/segment/format.go generated vendored Normal file
View File

@ -0,0 +1,252 @@
// Copyright (c) HashiCorp, Inc
// SPDX-License-Identifier: MPL-2.0
package segment
import (
"bytes"
"encoding/binary"
"errors"
"fmt"
"io"
"github.com/hashicorp/raft-wal/types"
)
const (
// MaxEntrySize is the largest we allow any single raft log entry to be. This
// is larger than our raft implementation ever allows so seems safe to encode
// statically for now. We could make this configurable. Its main purpose is
// to limit allocation when reading entries back if their lengths are
// corrupted.
MaxEntrySize = 64 * 1024 * 1024 // 64 MiB
// minBufSize is the size we allocate for read and write buffers. Setting it
// larger wastes more memory but increases the chances that we'll read the
// whole frame in a single shot and not need a second allocation and trip to
// the disk.
minBufSize = 64 * 1024
fileHeaderLen = 32
version = 0
magic = 0x58eb6b0d
// Note that this must remain a power of 2 to ensure aligning to this also
// aligns to sector boundaries.
frameHeaderLen = 8
)
const ( // Start iota from 0
FrameInvalid uint8 = iota
FrameEntry
FrameIndex
FrameCommit
)
var (
// ErrTooBig indicates that the caller tried to write a logEntry with a
// payload that's larger than we are prepared to support.
ErrTooBig = errors.New("entries larger than 64MiB are not supported")
)
/*
File Header functions
0      1      2      3      4      5      6      7      8
+------+------+------+------+------+------+------+------+
| Magic                     | Reserved           | Vsn  |
+------+------+------+------+------+------+------+------+
| BaseIndex                                             |
+------+------+------+------+------+------+------+------+
| SegmentID                                             |
+------+------+------+------+------+------+------+------+
| Codec                                                 |
+------+------+------+------+------+------+------+------+
*/
// writeFileHeader writes a file header into buf for the given file metadata.
func writeFileHeader(buf []byte, info types.SegmentInfo) error {
if len(buf) < fileHeaderLen {
return io.ErrShortBuffer
}
binary.LittleEndian.PutUint32(buf[0:4], magic)
// Explicitly zero Reserved bytes just in case
buf[4] = 0
buf[5] = 0
buf[6] = 0
buf[7] = version
binary.LittleEndian.PutUint64(buf[8:16], info.BaseIndex)
binary.LittleEndian.PutUint64(buf[16:24], info.ID)
binary.LittleEndian.PutUint64(buf[24:32], info.Codec)
return nil
}
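// Worked example (not part of the upstream file): for BaseIndex=1, ID=7 and
// Codec=0 the 32 header bytes are 0d 6b eb 58 (little-endian magic), three
// reserved zero bytes, 00 (version), then the little-endian uint64 values
// 1, 7 and 0 for BaseIndex, ID and Codec.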
// readFileHeader reads a file header from buf.
func readFileHeader(buf []byte) (*types.SegmentInfo, error) {
if len(buf) < fileHeaderLen {
return nil, io.ErrShortBuffer
}
var i types.SegmentInfo
m := binary.LittleEndian.Uint64(buf[0:8])
if m != magic {
return nil, types.ErrCorrupt
}
if buf[7] != version {
return nil, types.ErrCorrupt
}
i.BaseIndex = binary.LittleEndian.Uint64(buf[8:16])
i.ID = binary.LittleEndian.Uint64(buf[16:24])
i.Codec = binary.LittleEndian.Uint64(buf[24:32])
return &i, nil
}
func validateFileHeader(got, expect types.SegmentInfo) error {
if expect.ID != got.ID {
return fmt.Errorf("%w: segment header ID %x doesn't match metadata %x",
types.ErrCorrupt, got.ID, expect.ID)
}
if expect.BaseIndex != got.BaseIndex {
return fmt.Errorf("%w: segment header BaseIndex %d doesn't match metadata %d",
types.ErrCorrupt, got.BaseIndex, expect.BaseIndex)
}
if expect.Codec != got.Codec {
return fmt.Errorf("%w: segment header Codec %d doesn't match metadata %d",
types.ErrCorrupt, got.Codec, expect.Codec)
}
return nil
}
/*
Frame Functions
0      1      2      3      4      5      6      7      8
+------+------+------+------+------+------+------+------+
| Type | Reserved           | Length/CRC                |
+------+------+------+------+------+------+------+------+
*/
type frameHeader struct {
typ uint8
len uint32
crc uint32
}
func writeFrame(buf []byte, h frameHeader, payload []byte) error {
if len(buf) < encodedFrameSize(int(h.len)) {
return io.ErrShortBuffer
}
if err := writeFrameHeader(buf, h); err != nil {
return err
}
copy(buf[frameHeaderLen:], payload[:h.len])
// Explicitly write null bytes for padding
padBytes := padLen(int(h.len))
for i := 0; i < padBytes; i++ {
buf[frameHeaderLen+int(h.len)+i] = 0x0
}
return nil
}
func writeFrameHeader(buf []byte, h frameHeader) error {
if len(buf) < frameHeaderLen {
return io.ErrShortBuffer
}
buf[0] = h.typ
buf[1] = 0
buf[2] = 0
buf[3] = 0
lOrCRC := h.len
if h.typ == FrameCommit {
lOrCRC = h.crc
}
binary.LittleEndian.PutUint32(buf[4:8], lOrCRC)
return nil
}
var zeroHeader [frameHeaderLen]byte
func readFrameHeader(buf []byte) (frameHeader, error) {
var h frameHeader
if len(buf) < frameHeaderLen {
return h, io.ErrShortBuffer
}
switch buf[0] {
default:
return h, fmt.Errorf("%w: corrupt frame header with unknown type %d", types.ErrCorrupt, buf[0])
case FrameInvalid:
// Check if the whole header is zero and return a zero frame as this could
// just indicate we've read right off the end of the written data during
// recovery.
if bytes.Equal(buf[:frameHeaderLen], zeroHeader[:]) {
return h, nil
}
return h, fmt.Errorf("%w: corrupt frame header with type 0 but non-zero other fields", types.ErrCorrupt)
case FrameEntry, FrameIndex:
h.typ = buf[0]
h.len = binary.LittleEndian.Uint32(buf[4:8])
case FrameCommit:
h.typ = buf[0]
h.crc = binary.LittleEndian.Uint32(buf[4:8])
}
return h, nil
}
// padLen returns how many bytes of padding should be added to a frame of
// length n to ensure it is a multiple of frameHeaderLen. We keep frameHeaderLen
// a power of two so that typical sector sizes (e.g. 512 bytes) are whole
// multiples of it, which reduces the risk that headers are torn by being
// written across sector boundaries. It will return an int in the range [0, 7].
func padLen(n int) int {
// This looks a bit awful but it's just doing (n % 8) and subtracting that
// from 8 to get the number of bytes extra needed to get up to the next 8-byte
// boundary. The extra & 7 is to handle the case where n is a multiple of 8
// already and so n%8 is 0 and 8-0 is 8. By &ing 8 (0b1000) with 7 (0b111) we
// effectively wrap it back around to 0. This only works as long as
// frameHeaderLen is a power of 2 but that's necessary per comment above.
return (frameHeaderLen - (n % frameHeaderLen)) & (frameHeaderLen - 1)
}
func encodedFrameSize(payloadLen int) int {
return frameHeaderLen + payloadLen + padLen(payloadLen)
}
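// Worked example (not part of the upstream file): with frameHeaderLen = 8,
// padLen(5) = 3 and padLen(8) = 0, so encodedFrameSize(5) = 8+5+3 = 16 and
// encodedFrameSize(8) = 8+8+0 = 16; every frame therefore begins and ends on
// an 8-byte boundary.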
func indexFrameSize(numEntries int) int {
// Index frames are completely unnecessary if the whole block is a
// continuation with no new entries.
if numEntries == 0 {
return 0
}
return encodedFrameSize(numEntries * 4)
}
func writeIndexFrame(buf []byte, offsets []uint32) error {
if len(buf) < indexFrameSize(len(offsets)) {
return io.ErrShortBuffer
}
fh := frameHeader{
typ: FrameIndex,
len: uint32(len(offsets) * 4),
}
if err := writeFrameHeader(buf, fh); err != nil {
return err
}
cursor := frameHeaderLen
for _, o := range offsets {
binary.LittleEndian.PutUint32(buf[cursor:], o)
cursor += 4
}
if (len(offsets) % 2) == 1 {
// Odd number of entries, zero pad to keep it 8-byte aligned
binary.LittleEndian.PutUint32(buf[cursor:], 0)
}
return nil
}
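As a quick orientation to the frame helpers above, the following test-style sketch encodes a single entry frame and decodes its header again. It is not part of the vendored file; because writeFrame and readFrameHeader are unexported it would have to live inside the segment package, and the "hello" payload and function name are purely illustrative:

package segment

import "fmt"

func frameRoundTrip() error {
    payload := []byte("hello")
    fh := frameHeader{typ: FrameEntry, len: uint32(len(payload))}

    // 8-byte header + 5 payload bytes + 3 bytes of zero padding = 16 bytes.
    buf := make([]byte, encodedFrameSize(len(payload)))
    if err := writeFrame(buf, fh, payload); err != nil {
        return err
    }

    got, err := readFrameHeader(buf)
    if err != nil {
        return err
    }
    fmt.Printf("type=%d len=%d payload=%q\n", got.typ, got.len,
        buf[frameHeaderLen:frameHeaderLen+got.len])
    return nil
}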

160
vendor/github.com/hashicorp/raft-wal/segment/reader.go generated vendored Normal file
View File

@ -0,0 +1,160 @@
// Copyright (c) HashiCorp, Inc
// SPDX-License-Identifier: MPL-2.0
package segment
import (
"encoding/binary"
"errors"
"fmt"
"io"
"sync"
"github.com/hashicorp/raft-wal/types"
)
// Reader allows reading logs from a segment file.
type Reader struct {
info types.SegmentInfo
rf types.ReadableFile
bufPool *sync.Pool
// tail optionally provides an interface to the writer state when this is an
// unsealed segment so we can fetch from its in-memory index.
tail tailWriter
}
type tailWriter interface {
OffsetForFrame(idx uint64) (uint32, error)
}
func openReader(info types.SegmentInfo, rf types.ReadableFile, bufPool *sync.Pool) (*Reader, error) {
r := &Reader{
info: info,
rf: rf,
bufPool: bufPool,
}
return r, nil
}
// Close implements io.Closer
func (r *Reader) Close() error {
return r.rf.Close()
}
// GetLog returns the raw log entry bytes associated with idx. If the log
// doesn't exist in this segment types.ErrNotFound must be returned.
func (r *Reader) GetLog(idx uint64) (*types.PooledBuffer, error) {
offset, err := r.findFrameOffset(idx)
if err != nil {
return nil, err
}
_, payload, err := r.readFrame(offset)
if err != nil {
return nil, err
}
return payload, err
}
func (r *Reader) readFrame(offset uint32) (frameHeader, *types.PooledBuffer, error) {
buf := r.makeBuffer()
n, err := r.rf.ReadAt(buf.Bs, int64(offset))
if errors.Is(err, io.EOF) && n >= frameHeaderLen {
// We might have hit EOF just because our read buffer (at least 64KiB) might
// be larger than the space left in the file (say if files are tiny or if we
// are reading a frame near the end). So don't treat EOF as an error as
// long as we have actually managed to read a frameHeader - we'll work out
// if we got the whole thing or not below.
err = nil
// Re-slice buf.Bs so its len() reflects only what we actually managed to
// read. Note this doesn't impact the buffer length when it's returned to
// the pool, which will still return the whole cap.
buf.Bs = buf.Bs[:n]
}
if err != nil {
return frameHeader{}, nil, err
}
fh, err := readFrameHeader(buf.Bs)
if err != nil {
return fh, nil, err
}
if (frameHeaderLen + int(fh.len)) <= len(buf.Bs) {
// We already have all we need read, just return it sliced to just include
// the payload.
buf.Bs = buf.Bs[frameHeaderLen : frameHeaderLen+fh.len]
return fh, buf, nil
}
// Need to read again, with a bigger buffer, return this one
buf.Close()
// Need to read more bytes, validate that len is a sensible number
if fh.len > MaxEntrySize {
return fh, nil, fmt.Errorf("%w: frame header indicates a record larger than MaxEntrySize (%d bytes)", types.ErrCorrupt, MaxEntrySize)
}
buf = &types.PooledBuffer{
Bs: make([]byte, fh.len),
// No closer, let outsized buffers be GCed in case they are massive and way
// bigger than we need again. Could reconsider this if we find we need to
// optimize for frequent > minBufSize reads.
}
if _, err := r.rf.ReadAt(buf.Bs, int64(offset+frameHeaderLen)); err != nil {
return fh, nil, err
}
return fh, buf, nil
}
func (r *Reader) makeBuffer() *types.PooledBuffer {
if r.bufPool == nil {
return &types.PooledBuffer{Bs: make([]byte, minBufSize)}
}
buf := r.bufPool.Get().([]byte)
return &types.PooledBuffer{
Bs: buf,
CloseFn: func() {
// Note we always return the whole allocated buf regardless of what Bs
// ended up being sliced to.
r.bufPool.Put(buf)
},
}
}
func (r *Reader) findFrameOffset(idx uint64) (uint32, error) {
if r.tail != nil {
// This is not a sealed segment.
return r.tail.OffsetForFrame(idx)
}
// Sealed segment, read from the on-disk index block.
if r.info.IndexStart == 0 {
return 0, fmt.Errorf("sealed segment has no index block")
}
if idx < r.info.MinIndex || (r.info.MaxIndex > 0 && idx > r.info.MaxIndex) {
return 0, types.ErrNotFound
}
// IndexStart is the offset to the first entry in the index array. We need to
// find the byte offset to the Nth entry
entryOffset := (idx - r.info.BaseIndex)
byteOffset := r.info.IndexStart + (entryOffset * 4)
var bs [4]byte
n, err := r.rf.ReadAt(bs[:], int64(byteOffset))
if err == io.EOF && n == 4 {
// Read all of it just happened to be at end of file, ignore
err = nil
}
if err != nil {
return 0, fmt.Errorf("failed to read segment index: %w", err)
}
offset := binary.LittleEndian.Uint32(bs[:])
return offset, nil
}
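// Worked example (not part of the upstream file): for a sealed segment with
// BaseIndex=100 and IndexStart=4096, a lookup of idx=103 reads the 4-byte
// little-endian offset stored at file position 4096 + (103-100)*4 = 4108 and
// then reads the frame at that offset.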

599
vendor/github.com/hashicorp/raft-wal/segment/writer.go generated vendored Normal file
View File

@ -0,0 +1,599 @@
// Copyright (c) HashiCorp, Inc
// SPDX-License-Identifier: MPL-2.0
package segment
import (
"fmt"
"hash/crc32"
"io"
"sync"
"sync/atomic"
"github.com/hashicorp/raft-wal/types"
)
// Writer allows appending logs to a segment file as well as reading them back.
type Writer struct {
// commitIdx is updated after an append batch is fully persisted to disk to
// allow readers to read the new value. Note that readers must not read values
// larger than this even if they are available in the tail index, as they are not
// yet committed to disk!
commitIdx uint64
// offsets is the index offset. The first element corresponds to the
// BaseIndex. It is accessed concurrently by readers and the single writer
// without locks! This is race-free via the following invariants:
// - the slice here is never mutated, only copied, though it may still refer to
// the same backing array.
// - readers only ever read up to len(offsets) in the atomically accessed
// slice. Those elements of the backing array are immutable and will never
// be modified once they are accessible to readers.
// - readers and writers synchronize on atomic access to the slice
// - serial writer will only append to the end which either mutates the
// shared backing array but at an index greater than the len any reader has
// seen, or a new backing array is allocated and the old one copied into it
// which also will never mutate the entries readers can already "see" via
// the old slice.
offsets atomic.Value // []uint32
// writer state is accessed only on the (serial) write path so doesn't need
// synchronization.
writer struct {
// commitBuf stores the pending frames waiting to be flushed to the current
// tail block.
commitBuf []byte
// crc is the rolling crc32 Castagnoli sum of all data written since the
// last fsync.
crc uint32
// writeOffset is the absolute file offset up to which we've written data to
// the file. The contents of commitBuf will be written at this offset when
// it commits or we reach the end of the block, whichever happens first.
writeOffset uint32
// indexStart is set when the tail is sealed indicating the file offset at
// which the index array was written.
indexStart uint64
}
info types.SegmentInfo
wf types.WritableFile
r types.SegmentReader
}
func createFile(info types.SegmentInfo, wf types.WritableFile, bufPool *sync.Pool) (*Writer, error) {
r, err := openReader(info, wf, bufPool)
if err != nil {
return nil, err
}
w := &Writer{
info: info,
wf: wf,
r: r,
}
r.tail = w
if err := w.initEmpty(); err != nil {
return nil, err
}
return w, nil
}
func recoverFile(info types.SegmentInfo, wf types.WritableFile, bufPool *sync.Pool) (*Writer, error) {
r, err := openReader(info, wf, bufPool)
if err != nil {
return nil, err
}
w := &Writer{
info: info,
wf: wf,
r: r,
}
r.tail = w
if err := w.recoverTail(); err != nil {
return nil, err
}
return w, nil
}
func (w *Writer) initEmpty() error {
// Write header into write buffer to be written out with the first commit.
w.writer.writeOffset = 0
w.ensureBufCap(fileHeaderLen)
w.writer.commitBuf = w.writer.commitBuf[:fileHeaderLen]
if err := writeFileHeader(w.writer.commitBuf, w.info); err != nil {
return err
}
w.writer.crc = crc32.Checksum(w.writer.commitBuf[:fileHeaderLen], castagnoliTable)
// Initialize the index
offsets := make([]uint32, 0, 32*1024)
w.offsets.Store(offsets)
return nil
}
func (w *Writer) recoverTail() error {
// We need to track the last two commit frames
type commitInfo struct {
fh frameHeader
offset int64
crcStart int64
offsetsLen int
}
var prevCommit, finalCommit *commitInfo
offsets := make([]uint32, 0, 32*1024)
readInfo, err := readThroughSegment(w.wf, func(_ types.SegmentInfo, fh frameHeader, offset int64) (bool, error) {
switch fh.typ {
case FrameEntry:
// Record the frame offset
offsets = append(offsets, uint32(offset))
case FrameIndex:
// So this segment was sealed (or an attempt was made). Keep track of this
// indexStart in case it turns out the seal actually committed completely.
// We store the start of the actual array not the frame header.
w.writer.indexStart = uint64(offset) + frameHeaderLen
case FrameCommit:
// For commit frames the header's length field holds the CRC, not a payload length.
prevCommit = finalCommit
finalCommit = &commitInfo{
fh: fh,
offset: offset,
crcStart: 0, // First commit includes the file header
offsetsLen: len(offsets), // Track how many entries were found up to this commit point.
}
if prevCommit != nil {
finalCommit.crcStart = prevCommit.offset + frameHeaderLen
}
}
return true, nil
})
if err != nil {
return err
}
if finalCommit == nil {
// There were no commit frames found at all. This segment file is
// effectively empty. Init it that way ready for appending. This overwrites
// the file header so it doesn't matter if it was valid or not.
return w.initEmpty()
}
// Assume that the final commit is good for now and set the writer state
w.writer.writeOffset = uint32(finalCommit.offset + frameHeaderLen)
// Just store what we have for now to ensure the defer doesn't panic; we'll
// probably update this below.
w.offsets.Store(offsets)
// Whichever path we take, fix up the commitIdx before we leave
defer func() {
ofs := w.getOffsets()
if len(ofs) > 0 {
// Non atomic is OK because this file is not visible to any other threads
// yet.
w.commitIdx = w.info.BaseIndex + uint64(len(ofs)) - 1
}
}()
if finalCommit.offsetsLen < len(offsets) {
// Some entries were found after the last commit. Those must be a partial
// write that was uncommitted so can be ignored. But the fact they were
// written at all means that the last commit frame must have been completed
// and acknowledged so we don't need to verify anything. Just truncate the
// extra entries from index and reset the write cursor to continue appending
// after the last commit.
offsets = offsets[:finalCommit.offsetsLen]
w.offsets.Store(offsets)
// Since at least one commit was found, the header better be valid!
return validateFileHeader(*readInfo, w.info)
}
// Last frame was a commit frame! Let's check that all the data written in
// that commit frame made it to disk.
// Verify the length first
bufLen := finalCommit.offset - finalCommit.crcStart
// We know bufLen can't be bigger than the whole segment file because none of
// the values above were read from the data, only from the offsets we moved
// through.
batchBuf := make([]byte, bufLen)
if _, err := w.wf.ReadAt(batchBuf, finalCommit.crcStart); err != nil {
return fmt.Errorf("failed to read last committed batch for CRC validation: %w", err)
}
gotCrc := crc32.Checksum(batchBuf, castagnoliTable)
if gotCrc == finalCommit.fh.crc {
// All is good. We already setup the state we need for writer other than
// offsets.
w.offsets.Store(offsets)
// Since at least one commit was found, the header better be valid!
return validateFileHeader(*readInfo, w.info)
}
// Last commit was incomplete; rewind back to the previous one or the start of file
if prevCommit == nil {
// Init will re-write the file header so it doesn't matter if it was corrupt
// or not!
return w.initEmpty()
}
w.writer.writeOffset = uint32(prevCommit.offset + frameHeaderLen)
offsets = offsets[:prevCommit.offsetsLen]
w.offsets.Store(offsets)
// Since at least one commit was found, the header better be valid!
return validateFileHeader(*readInfo, w.info)
}
// Close implements io.Closer
func (w *Writer) Close() error {
return w.r.Close()
}
// GetLog implements types.SegmentReader
func (w *Writer) GetLog(idx uint64) (*types.PooledBuffer, error) {
return w.r.GetLog(idx)
}
// Append adds one or more entries. It must not return until the entries are
// durably stored otherwise raft's guarantees will be compromised.
func (w *Writer) Append(entries []types.LogEntry) error {
if len(entries) < 1 {
return nil
}
if w.writer.indexStart > 0 {
return types.ErrSealed
}
flushed := false
// Save any state we may need to rollback.
beforeBuf := w.writer.commitBuf
beforeCRC := w.writer.crc
beforeIndexStart := w.writer.indexStart
beforeWriteOffset := w.writer.writeOffset
beforeOffsets := w.offsets.Load()
defer func() {
if !flushed {
// rollback writer state on error
w.writer.commitBuf = beforeBuf
w.writer.crc = beforeCRC
w.writer.indexStart = beforeIndexStart
w.writer.writeOffset = beforeWriteOffset
w.offsets.Store(beforeOffsets)
}
}()
// Iterate entries and append each one
for _, e := range entries {
if err := w.appendEntry(e); err != nil {
return err
}
}
ofs := w.getOffsets()
// Work out if we need to seal before we commit and sync.
if (w.writer.writeOffset + uint32(len(w.writer.commitBuf)+indexFrameSize(len(ofs)))) > w.info.SizeLimit {
// Seal the segment! We seal it by writing an index frame before we commit.
if err := w.appendIndex(); err != nil {
return err
}
}
// Write the commit frame
if err := w.appendCommit(); err != nil {
return err
}
flushed = true
// Commit in-memory
atomic.StoreUint64(&w.commitIdx, entries[len(entries)-1].Index)
return nil
}
func (w *Writer) getOffsets() []uint32 {
return w.offsets.Load().([]uint32)
}
// OffsetForFrame implements tailWriter and allows readers to lookup entry
// frames in the tail's in-memory index.
func (w *Writer) OffsetForFrame(idx uint64) (uint32, error) {
if idx < w.info.BaseIndex || idx < w.info.MinIndex || idx > w.LastIndex() {
return 0, types.ErrNotFound
}
os := w.getOffsets()
entryIndex := idx - w.info.BaseIndex
// No bounds check on entryIndex since LastIndex must ensure it's in bounds.
return os[entryIndex], nil
}
func (w *Writer) appendEntry(e types.LogEntry) error {
offsets := w.getOffsets()
// Check the invariant that this entry is the next one we expect otherwise our
// index logic is incorrect and will result in panics on read.
if e.Index != w.info.BaseIndex+uint64(len(offsets)) {
return fmt.Errorf("non-monotonic append to segment with BaseIndex=%d. Entry index %d, expected %d",
w.info.BaseIndex, e.Index, w.info.BaseIndex+uint64(len(offsets)))
}
fh := frameHeader{
typ: FrameEntry,
len: uint32(len(e.Data)),
}
bufOffset, err := w.appendFrame(fh, e.Data)
if err != nil {
return err
}
// Update the offsets index
// Add the index entry. Note this is safe despite mutating the same backing
// array as tail because it's beyond the limit current readers will access
// until we do the atomic update below. Even if append re-allocates the
// backing array, it will only read the indexes smaller than numEntries from
// the old array to copy them into the new one and we are not mutating the
// same memory locations. Old readers might still be looking at the old
// array (lower than numEntries) through the current tail.offsets slice but
// we are not touching that at least below numEntries.
offsets = append(offsets, w.writer.writeOffset+uint32(bufOffset))
// Now we can make it available to readers. Note that readers still
// shouldn't read it until we actually commit to disk (and increment
// commitIdx) but it's race free for them to do so now!
w.offsets.Store(offsets)
return nil
}
func (w *Writer) appendCommit() error {
fh := frameHeader{
typ: FrameCommit,
crc: w.writer.crc,
}
if _, err := w.appendFrame(fh, nil); err != nil {
return err
}
// Flush all writes to the file
if err := w.sync(); err != nil {
return err
}
// Finally, reset crc so that by the time we write the next trailer
// we'll know where the append batch started.
w.writer.crc = 0
return nil
}
func (w *Writer) ensureBufCap(extraLen int) {
needCap := len(w.writer.commitBuf) + extraLen
if cap(w.writer.commitBuf) < needCap {
newSize := minBufSize
// Double buffer size until it's big enough to amortize cost
for newSize < needCap {
newSize = newSize * 2
}
newBuf := make([]byte, newSize)
oldLen := len(w.writer.commitBuf)
copy(newBuf, w.writer.commitBuf)
w.writer.commitBuf = newBuf[:oldLen]
}
}
func (w *Writer) appendIndex() error {
// Append the index record before we commit (commit and flush happen later
// generally)
offsets := w.getOffsets()
l := indexFrameSize(len(offsets))
w.ensureBufCap(l)
startOff := len(w.writer.commitBuf)
if err := writeIndexFrame(w.writer.commitBuf[startOff:startOff+l], offsets); err != nil {
return err
}
w.writer.commitBuf = w.writer.commitBuf[:startOff+l]
// Update crc with those values
w.writer.crc = crc32.Update(w.writer.crc, castagnoliTable, w.writer.commitBuf[startOff:startOff+l])
// Record the file offset where the index starts (the actual index data so
// after the frame header).
w.writer.indexStart = uint64(w.writer.writeOffset) + uint64(startOff+frameHeaderLen)
return nil
}
// appendFrame appends the given frame to the current block. The frame must fit
// already otherwise an error will be returned.
func (w *Writer) appendFrame(fh frameHeader, data []byte) (int, error) {
// Encode frame header into current block buffer
l := encodedFrameSize(len(data))
w.ensureBufCap(l)
bufOffset := len(w.writer.commitBuf)
if err := writeFrame(w.writer.commitBuf[bufOffset:bufOffset+l], fh, data); err != nil {
return 0, err
}
// Update len of commitBuf since we resliced it for the write
w.writer.commitBuf = w.writer.commitBuf[:bufOffset+l]
// Update the CRC
w.writer.crc = crc32.Update(w.writer.crc, castagnoliTable, w.writer.commitBuf[bufOffset:bufOffset+l])
return bufOffset, nil
}
func (w *Writer) flush() error {
// Write to file
n, err := w.wf.WriteAt(w.writer.commitBuf, int64(w.writer.writeOffset))
if err == io.EOF && n == len(w.writer.commitBuf) {
// Writer may return EOF even if it wrote all bytes if it wrote right up to
// the end of the file. Ignore that case though.
err = nil
}
if err != nil {
return err
}
// Reset writer state ready for next writes
w.writer.writeOffset += uint32(len(w.writer.commitBuf))
w.writer.commitBuf = w.writer.commitBuf[:0]
return nil
}
func (w *Writer) sync() error {
// Write out current buffer to file
if err := w.flush(); err != nil {
return err
}
// Sync file
if err := w.wf.Sync(); err != nil {
return err
}
// Update commitIdx atomically
offsets := w.getOffsets()
commitIdx := uint64(0)
if len(offsets) > 0 {
// Probably not possible for this to be less, but just in case we ever flush
// the file with only metadata written...
commitIdx = uint64(w.info.BaseIndex) + uint64(len(offsets)) - 1
}
atomic.StoreUint64(&w.commitIdx, commitIdx)
return nil
}
// Sealed returns whether the segment is sealed or not. If it is, it returns
// true and the file offset that its index array starts at, to be saved in
// metadata. WAL will call this after every append so it should be relatively
// cheap in the common case. This design allows the final Append to write out
// the index or any additional data needed at seal time in the same fsync.
func (w *Writer) Sealed() (bool, uint64, error) {
if w.writer.indexStart == 0 {
return false, 0, nil
}
return true, w.writer.indexStart, nil
}
// ForceSeal forces us to seal the segment by writing out an index block
// wherever we got to in the file. After calling this it is no longer valid to
// call Append on this file.
func (w *Writer) ForceSeal() (uint64, error) {
if w.writer.indexStart > 0 {
// Already sealed, this is a no-op.
return w.writer.indexStart, nil
}
// Seal the segment! We seal it by writing an index frame before we commit.
if err := w.appendIndex(); err != nil {
return 0, err
}
// Write the commit frame
if err := w.appendCommit(); err != nil {
return 0, err
}
return w.writer.indexStart, nil
}
// LastIndex returns the most recently persisted index in the log. It must
// respond without blocking on append since it's needed frequently by read
// paths that may call it concurrently. Typically this will be loaded from an
// atomic int. If the segment is empty lastIndex should return zero.
func (w *Writer) LastIndex() uint64 {
return atomic.LoadUint64(&w.commitIdx)
}
func readThroughSegment(r types.ReadableFile, fn func(info types.SegmentInfo, fh frameHeader, offset int64) (bool, error)) (*types.SegmentInfo, error) {
// First read the file header. Note we wrote it as part of the first commit so
// it may be missing or partially written and that's OK as long as there are no
// other later commit frames!
var fh [fileHeaderLen]byte
_, err := r.ReadAt(fh[:], 0)
// EOF is ok - the file might be empty if we crashed before committing
// anything and preallocation isn't supported.
if err != io.EOF && err != nil {
return nil, err
}
readInfo, err := readFileHeader(fh[:])
if err == types.ErrCorrupt {
// Header is malformed or missing, don't error yet though we'll detect it
// later when we know if it's a problem or not.
err = nil
}
if err != nil {
return nil, err
}
// If the header wasn't detected as corrupt, it might still be corrupt in a way
// that appears valid since we've not verified it against the expected metadata
// yet. We'll wait to see if the header was part of the last commit before
// deciding whether to validate it for corruption or not. For now just make
// sure it's not nil so we don't have to handle nil checks everywhere.
if readInfo == nil {
// Zero info will fail validation against the actual metadata if it was
// corrupt when it shouldn't be later. Just prevents a nil panic.
readInfo = &types.SegmentInfo{}
}
// Read through file from after header until we hit zeros, EOF or corrupt
// frames.
offset := int64(fileHeaderLen)
var buf [frameHeaderLen]byte
for {
n, err := r.ReadAt(buf[:], offset)
if err == io.EOF {
if n < frameHeaderLen {
return readInfo, nil
}
// This is OK! The last frame in file might be a commit frame so as long
// as we have it all then we can ignore the EOF for this iteration.
err = nil
}
if err != nil {
return readInfo, fmt.Errorf("failed reading frame at offset=%d: %w", offset, err)
}
fh, err := readFrameHeader(buf[:frameHeaderLen])
if err != nil {
// This is not actually an error case. If we failed to decode it could be
// because of a torn write (since we don't assume writes are atomic). We
// assume that previously committed data is not silently corrupted by the
// FS (see README for details). So this must be due to corruption that
// happened due to non-atomic sector updates whilst committing the last
// write batch.
return readInfo, nil
}
if fh.typ == FrameInvalid {
// This means we've hit zeros at the end of the file (or due to an
// incomplete write, which we treat the same way).
return readInfo, nil
}
// Call the callback
shouldContinue, err := fn(*readInfo, fh, offset)
if err != nil {
return readInfo, err
}
if !shouldContinue {
return readInfo, nil
}
// Skip to next frame
offset += int64(encodedFrameSize(int(fh.len)))
}
}

215
vendor/github.com/hashicorp/raft-wal/state.go generated vendored Normal file
View File

@ -0,0 +1,215 @@
// Copyright (c) HashiCorp, Inc
// SPDX-License-Identifier: MPL-2.0
package wal
import (
"sync/atomic"
"github.com/benbjohnson/immutable"
"github.com/hashicorp/raft-wal/types"
)
// state is an immutable snapshot of the state of the log. Modifications must be
// made by copying and modifying the copy. This is easy enough because segments
// is an immutable map so changing and re-assigning to the clone won't impact
// the original map, and tail is just a pointer that can be mutated in the
// shallow clone. Note that methods called on the tail segmentWriter may mutate
// its state so must only be called while holding the WAL's write lock.
type state struct {
// refCount tracks readers that are reading segments based on this metadata.
// It is accessed atomically and must be 64-bit aligned (i.e. leave it at the
// start of the struct).
refCount int32
// finalizer is set at most once while WAL is holding the write lock in order
// to provide a func that must be called when all current readers are done
// with this state. It's used for deferring closing and deleting old segments
// until we can be sure no reads are still in progress on them.
finalizer atomic.Value // func()
nextSegmentID uint64
// nextBaseIndex is used to signal which baseIndex to use next if there are no
// segments or current tail.
nextBaseIndex uint64
segments *immutable.SortedMap[uint64, segmentState]
tail types.SegmentWriter
}
type segmentState struct {
types.SegmentInfo
// r is the SegmentReader for our in-memory state.
r types.SegmentReader
}
// Persistent converts the in-memory state into a PersistentState.
func (s *state) Persistent() types.PersistentState {
segs := make([]types.SegmentInfo, 0, s.segments.Len())
it := s.segments.Iterator()
for !it.Done() {
_, s, _ := it.Next()
segs = append(segs, s.SegmentInfo)
}
return types.PersistentState{
NextSegmentID: s.nextSegmentID,
Segments: segs,
}
}
func (s *state) getLog(index uint64) (*types.PooledBuffer, error) {
// Check the tail writer first
if s.tail != nil {
raw, err := s.tail.GetLog(index)
if err != nil && err != ErrNotFound {
// Return actual errors since they might mask the fact that index really
// is in the tail but failed to read for some other reason.
return nil, err
}
if err == nil {
// No error means we found it and just need to decode.
return raw, nil
}
// Not in the tail segment, fall back to searching previous segments.
}
seg, err := s.findSegmentReader(index)
if err != nil {
return nil, err
}
return seg.GetLog(index)
}
// findSegmentReader searches the segment tree for the segment that contains the
// log at index idx. It may return the tail segment which may not in fact
// contain idx if idx is larger than the last written index. Typically this is
// called after already checking with the tail writer whether the log is in
// there which means the caller can be sure it's not going to return the tail
// segment.
func (s *state) findSegmentReader(idx uint64) (types.SegmentReader, error) {
if s.segments.Len() == 0 {
return nil, ErrNotFound
}
// Search for a segment with baseIndex.
it := s.segments.Iterator()
// The baseIndex we want is the first one lower or equal to idx. Seek gets us
// to the first result equal or greater so we are either at it (if equal) or
// on the one _after_ the one we need. We step back since that's most likely
// the segment we want.
it.Seek(idx)
// The first call to Next/Prev actually returns the node the iterator is
// currently on (which is probably the one after the one we want) but in some
// edge cases we might actually want this one. Rather than reversing back and
// coming forward again, just check both this and the one before it.
_, seg, ok := it.Prev()
if ok && seg.BaseIndex > idx {
_, seg, ok = it.Prev()
}
// We either have the right segment or it doesn't exist.
if ok && seg.MinIndex <= idx && (seg.MaxIndex == 0 || seg.MaxIndex >= idx) {
return seg.r, nil
}
return nil, ErrNotFound
}
func (s *state) getTailInfo() *segmentState {
it := s.segments.Iterator()
it.Last()
_, tail, ok := it.Next()
if !ok {
return nil
}
return &tail
}
func (s *state) append(entries []types.LogEntry) error {
return s.tail.Append(entries)
}
func (s *state) firstIndex() uint64 {
it := s.segments.Iterator()
_, seg, ok := it.Next()
if !ok {
return 0
}
if seg.SealTime.IsZero() {
// First segment is unsealed so it is also the tail. Check it actually has at
// least one log in it, otherwise it doesn't matter what the BaseIndex/MinIndex
// are.
if s.tail.LastIndex() == 0 {
// No logs in the WAL
return 0
}
// At least one log exists, return the MinIndex
}
return seg.MinIndex
}
func (s *state) lastIndex() uint64 {
tailIdx := s.tail.LastIndex()
if tailIdx > 0 {
return tailIdx
}
// Current tail is empty. Check there are previous sealed segments.
it := s.segments.Iterator()
it.Last()
_, _, ok := it.Prev()
if !ok {
// No tail! shouldn't be possible but means no logs yet
return 0
}
// Go back to the segment before the tail
_, _, ok = it.Prev()
if !ok {
// No previous segment so the whole log is empty
return 0
}
// There was a previous segment so its MaxIndex will be one less than the
// tail's BaseIndex.
tailSeg := s.getTailInfo()
if tailSeg == nil || tailSeg.BaseIndex == 0 {
return 0
}
return tailSeg.BaseIndex - 1
}
func (s *state) acquire() func() {
atomic.AddInt32(&s.refCount, 1)
return s.release
}
func (s *state) release() {
// decrement on release
new := atomic.AddInt32(&s.refCount, -1)
if new == 0 {
// Clean up state associated with this version now that all refs have gone.
// Since there are no more refs, and we should not set a finalizer until this
// state is no longer the active state, we can be sure this will happen only
// once. Even so, let's swap the fn to ensure we only call the finalizer once
// ever! We can't swap an actual nil as it's not the same type as func(), so
// do a dance with a nilFn below.
var nilFn func()
fnRaw := s.finalizer.Swap(nilFn)
if fn, ok := fnRaw.(func()); ok && fn != nil {
fn()
}
}
}
// clone returns a new state which is a shallow copy of just the immutable parts
// of s. This is safer than a simple assignment copy because that "reads" the
// atomically modified state non-atomically. We never want to copy the refCount
// or finalizer anyway.
func (s *state) clone() state {
return state{
nextSegmentID: s.nextSegmentID,
segments: s.segments,
tail: s.tail,
}
}

21
vendor/github.com/hashicorp/raft-wal/types/buffer.go generated vendored Normal file
View File

@ -0,0 +1,21 @@
// Copyright (c) HashiCorp, Inc
// SPDX-License-Identifier: MPL-2.0
package types
// PooledBuffer is a wrapper that allows WAL to return read buffers to segment
// implementations when we're done decoding.
type PooledBuffer struct {
Bs []byte
CloseFn func()
}
// Close implements io.Closer and returns the buffer to the pool. It should be
// called exactly once for each buffer when it's no longer needed. It's no
// longer safe to access Bs or any slice taken from it after the call.
func (b *PooledBuffer) Close() error {
if b.CloseFn != nil {
b.CloseFn()
}
return nil
}
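A short sketch (not part of the vendored file) of the intended lifecycle: decode or copy the bytes, then Close the buffer so the backing slice can be recycled. The readEntryCopy name and its SegmentReader parameter are illustrative only:

package types

// readEntryCopy copies the entry bytes out before returning the pooled buffer.
func readEntryCopy(r SegmentReader, idx uint64) ([]byte, error) {
    buf, err := r.GetLog(idx)
    if err != nil {
        return nil, err
    }
    defer buf.Close() // buf.Bs must not be referenced after this runs

    out := make([]byte, len(buf.Bs))
    copy(out, buf.Bs)
    return out, nil
}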

41
vendor/github.com/hashicorp/raft-wal/types/meta.go generated vendored Normal file
View File

@ -0,0 +1,41 @@
// Copyright (c) HashiCorp, Inc
// SPDX-License-Identifier: MPL-2.0
package types
import "io"
// MetaStore is the interface we need to a persistent, crash-safe backend. We
// implement it with BoltDB for real usage but the interface allows alternatives
// to be used, or tests to mock out FS access.
type MetaStore interface {
// Load loads the existing persisted state. If there is no existing state,
// implementations are expected to initialize new storage and return an
// empty state.
Load(dir string) (PersistentState, error)
// CommitState must atomically replace all persisted metadata in the current
// store with the set provided. It must not return until the data is persisted
// durably and in a crash-safe way otherwise the guarantees of the WAL will be
// compromised. The WAL will only ever call this in a single thread at one
// time and it will never be called concurrently with Load however it may be
// called concurrently with Get/SetStable operations.
CommitState(PersistentState) error
// GetStable returns a value from stable store or nil if it doesn't exist. May
// be called concurrently by multiple threads.
GetStable(key []byte) ([]byte, error)
// SetStable stores a value in the stable store. May be called concurrently with
// GetStable.
SetStable(key, value []byte) error
io.Closer
}
// PersistentState represents the WAL file metadata we need to store reliably to
// recover on restart.
type PersistentState struct {
NextSegmentID uint64
Segments []SegmentInfo
}

150
vendor/github.com/hashicorp/raft-wal/types/segment.go generated vendored Normal file
View File

@ -0,0 +1,150 @@
// Copyright (c) HashiCorp, Inc
// SPDX-License-Identifier: MPL-2.0
package types
import (
"io"
"time"
)
// SegmentInfo is the metadata describing a single WAL segment.
type SegmentInfo struct {
// ID uniquely identifies this segment file
ID uint64
// BaseIndex is the raft index of the first entry that will be written to the
// segment.
BaseIndex uint64
// MinIndex is the logical lowest index that still exists in the segment. It
// may be greater than BaseIndex if a head truncation has "deleted" a prefix
// of the segment.
MinIndex uint64
// MaxIndex is the logical highest index that still exists in the segment. It
// may be lower than the actual highest index if a tail truncation has
// "deleted" a suffix of the segment. It is zero for unsealed segments and
// only set on seal.
MaxIndex uint64
// Codec identifies the codec used to encode log entries. Codec values 0 to
// 16k (i.e. the lower 16 bits) are reserved for internal future usage. Custom
// codecs must be registered with an identifier higher than this which the
// caller is responsible for ensuring uniquely identifies the specific version
// of their codec used in any given log. uint64 provides sufficient space that
// a randomly generated identifier is almost certainly unique.
Codec uint64
// IndexStart is the file offset where the index can be read from. It is 0 for
// tail segments and only set after a segment is sealed.
IndexStart uint64
// CreateTime records when the segment was first created.
CreateTime time.Time
// SealTime records when the segment was sealed. Zero indicates that it's not
// sealed yet.
SealTime time.Time
// SizeLimit is the soft limit for the segment's size. The segment file may be
// pre-allocated to this size on filesystems that support it. It is a soft
// limit in the sense that the final Append usually takes the segment file
// past this size before it is considered full and sealed.
SizeLimit uint32
}
// SegmentFiler is the interface that provides access to segments to the WAL. It
// encapsulates creating and recovering segments and returning reader or writer
// interfaces to interact with them. Its main purpose is to abstract the core
// WAL logic from the actual encoding layer of segment files. You can think
// of it as a layer of abstraction above the VFS which abstracts actual file
// system operations on files but knows nothing about the format. In tests for
// example we can implement a SegmentFiler that is way simpler than the real
// encoding/decoding layer on top of a VFS - even an in-memory VFS which makes
// tests much simpler to write and run.
type SegmentFiler interface {
// Create adds a new segment with the given info and returns a writer or an
// error.
Create(info SegmentInfo) (SegmentWriter, error)
// RecoverTail is called on an unsealed segment when re-opening the WAL; it
// will attempt to recover from a possible crash. It will either return an
// error, or return a valid segmentWriter that is ready for further appends.
// If the expected tail segment doesn't exist it must return an error wrapping
// os.ErrNotExist.
RecoverTail(info SegmentInfo) (SegmentWriter, error)
// Open an already sealed segment for reading. Open may validate the file's
// header and return an error if it doesn't match the expected info.
Open(info SegmentInfo) (SegmentReader, error)
// List returns the set of segment IDs currently stored. It's used by the WAL
// on recovery to find any segment files that need to be deleted following an
// unclean shutdown. The returned map is a map of ID -> BaseIndex. BaseIndex
// is returned to allow subsequent Delete calls to be made.
List() (map[uint64]uint64, error)
// Delete removes the segment with given baseIndex and id if it exists. Note
// that baseIndex is technically redundant since ID is unique on its own. But
// in practice we name files (or keys) with both so that they sort correctly.
// This interface allows a simpler implementation where we can just delete
// the file if it exists without having to scan the underlying storage for a
// match.
Delete(baseIndex, ID uint64) error
}
// SegmentWriter manages appending logs to the tail segment of the WAL. It's an
// interface to make testing core WAL simpler. Every SegmentWriter will have
// either `init` or `recover` called once before any other methods. When either
// returns it must either return an error or be ready to accept new writes and
// reads.
type SegmentWriter interface {
io.Closer
SegmentReader
// Append adds one or more entries. It must not return until the entries are
// durably stored otherwise raft's guarantees will be compromised. Append must
// not be called concurrently with any other call to Sealed, Append or
// ForceSeal.
Append(entries []LogEntry) error
// Sealed returns whether the segment is sealed or not. If it is, it returns
// true and the file offset that its index array starts at, to be saved in
// metadata. WAL will call this after every append so it should be relatively
// cheap in the common case. This design allows the final Append to write out
// the index or any additional data needed at seal time in the same fsync.
// Sealed must not be called concurrently with any other call to Sealed,
// Append or ForceSeal.
Sealed() (bool, uint64, error)
// ForceSeal causes the segment to become sealed by writing out an index
// block. This is not used in the typical flow of append and rotation, but is
// necessary during truncations where some suffix of the writer needs to be
// truncated. Rather than manipulate what is on disk in a complex way, the WAL
// will simply force seal it with whatever state it has already saved and then
// open a new segment at the right offset for continued writing. ForceSeal may
// be called on a segment that has already been sealed and should just return
// the existing index offset in that case. (We don't actually rely on that
// currently but it's easier not to assume we'll always call it at most once).
// ForceSeal must not be called concurrently with any other call to Sealed,
// Append or ForceSeal.
ForceSeal() (uint64, error)
// LastIndex returns the most recently persisted index in the log. It must
// respond without blocking on Append since it's needed frequently by read
// paths that may call it concurrently. Typically this will be loaded from an
// atomic int. If the segment is empty lastIndex should return zero.
LastIndex() uint64
}
// SegmentReader wraps a ReadableFile to allow lookup of logs in an existing
// segment file. It's an interface to make testing core WAL simpler. The first
// call will always be validate which passes in the ReaderAt to be used for
// subsequent reads.
type SegmentReader interface {
io.Closer
// GetLog returns the raw log entry bytes associated with idx. If the log
// doesn't exist in this segment ErrNotFound must be returned.
GetLog(idx uint64) (*PooledBuffer, error)
}

27
vendor/github.com/hashicorp/raft-wal/types/types.go generated vendored Normal file
View File

@ -0,0 +1,27 @@
// Copyright (c) HashiCorp, Inc
// SPDX-License-Identifier: MPL-2.0
package types
import (
"errors"
"github.com/hashicorp/raft"
)
var (
// ErrNotFound is our own version of raft's not found error. It's important
// it's exactly the same because the raft lib checks for equality with its
// own type as a crucial part of replication processing (detecting end of logs
// and that a snapshot is needed for a follower).
ErrNotFound = raft.ErrLogNotFound
ErrCorrupt = errors.New("WAL is corrupt")
ErrSealed = errors.New("segment is sealed")
ErrClosed = errors.New("closed")
)
// LogEntry represents an entry that has already been encoded.
type LogEntry struct {
Index uint64
Data []byte
}

59
vendor/github.com/hashicorp/raft-wal/types/vfs.go generated vendored Normal file
View File

@ -0,0 +1,59 @@
// Copyright (c) HashiCorp, Inc
// SPDX-License-Identifier: MPL-2.0
package types
import "io"
// VFS is the interface WAL needs to interact with the file system. In
// production it would normally be implemented by RealFS which interacts with
// the operating system FS using standard go os package. It's useful to allow
// testing both to run quicker (by being in memory only) and to make it easy to
// simulate all kinds of disk errors and failure modes without needing a more
// elaborate external test harness like ALICE.
type VFS interface {
// ListDir returns a list of all files in the specified dir in lexicographical
// order. If the dir doesn't exist, it must return an error. Empty array with
// nil error is assumed to mean that the directory exists and was readable,
// but contains no files.
ListDir(dir string) ([]string, error)
// Create creates a new file with the given name. If a file with the same name
// already exists an error is returned. If a non-zero size is given,
// implementations should make a best effort to pre-allocate the file to be
// that size. The dir must already exist and be writable to the current
// process.
Create(dir, name string, size uint64) (WritableFile, error)
// Delete indicates the file is no longer required. Typically it should be
// deleted from the underlying system to free disk space.
Delete(dir, name string) error
// OpenReader opens an existing file in read-only mode. If the file doesn't
// exist or permission is denied, an error is returned, otherwise no checks
// are made about the well-formedness of the file, it may be empty, the wrong
// size or corrupt in arbitrary ways.
OpenReader(dir, name string) (ReadableFile, error)
// OpenWriter opens a file in read-write mode. If the file doesn't exist or
// permission is denied, an error is returned, otherwise no checks are made
// about the well-formedness of the file, it may be empty, the wrong size or
// corrupt in arbitrary ways.
OpenWriter(dir, name string) (WritableFile, error)
}
// WritableFile provides random read-write access to a file as well as the
// ability to fsync it to disk.
type WritableFile interface {
io.WriterAt
io.ReaderAt
io.Closer
Sync() error
}
// ReadableFile provides random read access to a file.
type ReadableFile interface {
io.ReaderAt
io.Closer
}

957
vendor/github.com/hashicorp/raft-wal/wal.go generated vendored Normal file
View File

@ -0,0 +1,957 @@
// Copyright (c) HashiCorp, Inc
// SPDX-License-Identifier: MPL-2.0
package wal
import (
"bytes"
"encoding/binary"
"errors"
"fmt"
"io"
"os"
"sync"
"sync/atomic"
"time"
"github.com/benbjohnson/immutable"
"github.com/hashicorp/go-hclog"
"github.com/hashicorp/raft"
"github.com/hashicorp/raft-wal/metrics"
"github.com/hashicorp/raft-wal/types"
)
var (
_ raft.LogStore = &WAL{}
_ raft.StableStore = &WAL{}
ErrNotFound = types.ErrNotFound
ErrCorrupt = types.ErrCorrupt
ErrSealed = types.ErrSealed
ErrClosed = types.ErrClosed
DefaultSegmentSize = 64 * 1024 * 1024
)
var (
_ raft.LogStore = &WAL{}
_ raft.MonotonicLogStore = &WAL{}
_ raft.StableStore = &WAL{}
)
// WAL is a write-ahead log suitable for github.com/hashicorp/raft.
type WAL struct {
closed uint32 // atomically accessed to keep it first in struct for alignment.
dir string
codec Codec
sf types.SegmentFiler
metaDB types.MetaStore
metrics metrics.Collector
log hclog.Logger
segmentSize int
// s is the current state of the WAL files. It is an immutable snapshot that
// can be accessed without a lock when reading. We only support a single
// writer so all methods that mutate either the WAL state or append to the
// tail of the log must hold the writeMu until they complete all changes.
s atomic.Value // *state
// writeMu must be held when modifying s or while appending to the tail.
// Although we take care never to let readers block writer, we still only
// allow a single writer to be updating the meta state at once. The mutex must
// be held before s is loaded until all modifications to s or appends to the
// tail are complete.
writeMu sync.Mutex
// These chans are used to hand off serial execution for segment rotation to a
// background goroutine so that StoreLogs can return and allow the caller to
// get on with other work while we mess with files. The next call to StoreLogs
// needs to wait until the background work is done though since the current
// log is sealed.
//
// At the end of StoreLogs, if the segment was sealed, still holding writeMu
// we make awaitRotate so it's non-nil, then send the indexStart on
// triggerRotate which is 1-buffered. We then drop the lock and return to
// caller. The rotation goroutine reads from triggerRotate in a loop, takes
// the write lock performs rotation and then closes awaitRotate and sets it to
// nil before releasing the lock. The next StoreLogs call takes the lock,
// checks if awaitRotate. If it is nil there is no rotation going on so
// StoreLogs can proceed. If it is non-nil, it releases the lock and then
// waits on the close before acquiring the lock and continuing.
triggerRotate chan uint64
awaitRotate chan struct{}
}
type walOpt func(*WAL)
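// A minimal usage sketch (not part of the upstream file): the directory path,
// fsm, snaps and trans values are illustrative, and the WithSegmentSize option
// name is assumed from the library's documented options.
//
//	store, err := Open("/var/lib/deevirt/raft/wal", WithSegmentSize(32*1024*1024))
//	if err != nil {
//		return err
//	}
//	// WAL satisfies both raft.LogStore and raft.StableStore.
//	r, err := raft.NewRaft(raft.DefaultConfig(), fsm, store, store, snaps, trans)
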
// Open attempts to open the WAL stored in dir. If there are no existing WAL
// files a new WAL will be initialized there. The dir must already exist and be
// readable and writable to the current process. If existing files are found,
// recovery is attempted. If recovery is not possible an error is returned,
// otherwise the returned *WAL is in a state ready for use.
func Open(dir string, opts ...walOpt) (*WAL, error) {
w := &WAL{
dir: dir,
triggerRotate: make(chan uint64, 1),
}
// Apply options
for _, opt := range opts {
opt(w)
}
if err := w.applyDefaultsAndValidate(); err != nil {
return nil, err
}
// Load or create metaDB
persisted, err := w.metaDB.Load(w.dir)
if err != nil {
return nil, err
}
newState := state{
segments: &immutable.SortedMap[uint64, segmentState]{},
nextSegmentID: persisted.NextSegmentID,
}
// Get the set of all persisted segments so we can prune it down to just the
// unused ones as we go.
toDelete, err := w.sf.List()
if err != nil {
return nil, err
}
// Build the state
recoveredTail := false
for i, si := range persisted.Segments {
// Verify we can decode the entries.
// TODO: support multiple decoders to allow rotating codec.
if si.Codec != w.codec.ID() {
return nil, fmt.Errorf("segment with BasedIndex=%d uses an unknown codec", si.BaseIndex)
}
// We want to keep this segment since it's still in the metaDB list!
delete(toDelete, si.ID)
if si.SealTime.IsZero() {
// This is an unsealed segment. It _must_ be the last one. Safety check!
if i < len(persisted.Segments)-1 {
return nil, fmt.Errorf("unsealed segment is not at tail")
}
// Try to recover this segment
sw, err := w.sf.RecoverTail(si)
if errors.Is(err, os.ErrNotExist) {
// Handle no file specially. This can happen if we crashed right after
// persisting the metadata but before we managed to persist the new
// file. In fact it could happen if the whole machine loses power any
// time before the fsync of the parent dir since the FS could lose the
// dir entry for the new file until that point. We do ensure we pass
// that point before we return from Append for the first time in that
// new file so that's safe, but we have to handle recovering from that
// case here.
sw, err = w.sf.Create(si)
}
if err != nil {
return nil, err
}
// Set the tail and "reader" for this segment
ss := segmentState{
SegmentInfo: si,
r: sw,
}
newState.tail = sw
newState.segments = newState.segments.Set(si.BaseIndex, ss)
recoveredTail = true
// We're done with this loop, break here to avoid nesting all the rest of
// the logic!
break
}
// This is a sealed segment
// Open segment reader
sr, err := w.sf.Open(si)
if err != nil {
return nil, err
}
// Store the open reader to get logs from
ss := segmentState{
SegmentInfo: si,
r: sr,
}
newState.segments = newState.segments.Set(si.BaseIndex, ss)
}
if !recoveredTail {
// There was no unsealed segment at the end. This can only really happen
// when the log is empty with zero segments (either on creation or after a
// truncation that removed all segments) since we otherwise never allow the
// state to have a sealed tail segment. But this logic works regardless!
// Create a new segment. We use baseIndex of 1 even though the first append
// might be much higher - we'll allow that since we know we have no records
// yet and so lastIndex will also be 0.
si := w.newSegment(newState.nextSegmentID, 1)
newState.nextSegmentID++
ss := segmentState{
SegmentInfo: si,
}
newState.segments = newState.segments.Set(si.BaseIndex, ss)
// Persist the new meta to "commit" it even before we create the file so we
// don't attempt to recreate files with duplicate IDs on a later failure.
if err := w.metaDB.CommitState(newState.Persistent()); err != nil {
return nil, err
}
// Create the new segment file
sw, err := w.sf.Create(si)
if err != nil {
return nil, err
}
newState.tail = sw
// Update the segment in memory so we have a reader for the new segment. We
// don't need to commit again as this isn't changing the persisted metadata
// about the segment.
ss.r = sw
newState.segments = newState.segments.Set(si.BaseIndex, ss)
}
// Store the in-memory state (it was already persisted if we modified it
// above). There are no readers yet since we are constructing a new WAL so we
// don't need to jump through the mutateState hoops yet!
w.s.Store(&newState)
// Delete any unused segment files left over after a crash.
w.deleteSegments(toDelete)
// Start the rotation routine
go w.runRotate()
return w, nil
}
// stateTxn represents a transaction body that mutates the state under the
// writeLock. s is already a shallow copy of the current state that may be
// mutated as needed. If a nil error is returned, s will be atomically set as
// the new state. If a non-nil finalizer func is returned it will be atomically
// attached to the old state after it's been replaced but before the write lock
// is released. The finalizer will be called exactly once when all current
// readers have released the old state. If the transaction func returns a
// non-nil postCommit it is executed after the new state has been committed to
// metaDB. It may mutate the state further (captured by closure) before it is
// atomically committed in memory but the update won't be persisted to disk in
// this transaction. This is used where we need sequencing between committing
// meta and creating and opening a new file. Both need to happen in memory in
// one transaction but the disk commit isn't at the end! If postCommit returns
// an error, the state is not updated in memory and the error is returned to the
// mutate caller.
type stateTxn func(s *state) (finalizer func(), postCommit func() error, err error)
func (w *WAL) loadState() *state {
return w.s.Load().(*state)
}
// mutateState executes a stateTxn. writeLock MUST be held while calling this.
func (w *WAL) mutateStateLocked(tx stateTxn) error {
s := w.loadState()
s.acquire()
defer s.release()
newS := s.clone()
fn, postCommit, err := tx(&newS)
if err != nil {
return err
}
// Commit updates to meta
if err := w.metaDB.CommitState(newS.Persistent()); err != nil {
return err
}
if postCommit != nil {
if err := postCommit(); err != nil {
return err
}
}
w.s.Store(&newS)
s.finalizer.Store(fn)
return nil
}
// acquireState should be used by all readers to fetch the current state. The
// returned release func must be called when no further accesses to state or the
// data within it will be performed to free old files that may have been
// truncated concurrently.
func (w *WAL) acquireState() (*state, func()) {
s := w.loadState()
return s, s.acquire()
}
// newSegment creates a types.SegmentInfo with the passed ID and baseIndex, filling in
// the segment parameters based on the current WAL configuration.
func (w *WAL) newSegment(ID, baseIndex uint64) types.SegmentInfo {
return types.SegmentInfo{
ID: ID,
BaseIndex: baseIndex,
MinIndex: baseIndex,
SizeLimit: uint32(w.segmentSize),
// TODO make these configurable
Codec: CodecBinaryV1,
CreateTime: time.Now(),
}
}
// FirstIndex returns the first index written. 0 for no entries.
func (w *WAL) FirstIndex() (uint64, error) {
if err := w.checkClosed(); err != nil {
return 0, err
}
s, release := w.acquireState()
defer release()
return s.firstIndex(), nil
}
// LastIndex returns the last index written. 0 for no entries.
func (w *WAL) LastIndex() (uint64, error) {
if err := w.checkClosed(); err != nil {
return 0, err
}
s, release := w.acquireState()
defer release()
return s.lastIndex(), nil
}
// GetLog gets a log entry at a given index.
func (w *WAL) GetLog(index uint64, log *raft.Log) error {
if err := w.checkClosed(); err != nil {
return err
}
s, release := w.acquireState()
defer release()
w.metrics.IncrementCounter("log_entries_read", 1)
raw, err := s.getLog(index)
if err != nil {
return err
}
w.metrics.IncrementCounter("log_entry_bytes_read", uint64(len(raw.Bs)))
defer raw.Close()
// Decode the log
return w.codec.Decode(raw.Bs, log)
}
// StoreLog stores a log entry.
func (w *WAL) StoreLog(log *raft.Log) error {
return w.StoreLogs([]*raft.Log{log})
}
// StoreLogs stores multiple log entries.
func (w *WAL) StoreLogs(logs []*raft.Log) error {
if err := w.checkClosed(); err != nil {
return err
}
if len(logs) < 1 {
return nil
}
w.writeMu.Lock()
defer w.writeMu.Unlock()
// Ensure queued rotation has completed before us if we raced with it for
// write lock.
w.awaitRotationLocked()
s, release := w.acquireState()
defer release()
// Verify monotonicity since we assume it
lastIdx := s.lastIndex()
// Special case, if the log is currently empty and this is the first append,
// we allow any starting index. We've already created an empty tail segment
// though and probably started at index 1. Rather than break the invariant
// that BaseIndex is the same as the first index in the segment (which causes
// lots of extra complexity lower down) we simply accept the additional cost
// in this rare case of removing the current tail and re-creating it with the
// correct BaseIndex for the first log we are about to append. In practice,
// this only happens on startup of a new server, or after a user snapshot
// restore which are both rare enough events that the cost is not significant
// since the cost of creating other state or restoring snapshots is larger
// anyway. We could theoretically defer creating at all until we know for sure
// but that is more complex internally since then everything has to handle the
// uninitialized case where there is no tail yet with special cases.
ti := s.getTailInfo()
// Note we check index != ti.BaseIndex rather than index != 1 so that this
// works even if we choose to initialize first segments to a BaseIndex other
// than 1. For example it might be marginally more performant to choose to
// initialize to the old MaxIndex + 1 after a truncate since that is what our
// raft library will use after a restore currently so will avoid this case on
// the next append, while still being generally safe.
if lastIdx == 0 && logs[0].Index != ti.BaseIndex {
if err := w.resetEmptyFirstSegmentBaseIndex(logs[0].Index); err != nil {
return err
}
// Re-read state now we just changed it.
s2, release2 := w.acquireState()
defer release2()
// Overwrite the state we read before so the code below uses the new state
s = s2
}
// Encode logs
nBytes := uint64(0)
encoded := make([]types.LogEntry, len(logs))
for i, l := range logs {
if lastIdx > 0 && l.Index != (lastIdx+1) {
return fmt.Errorf("non-monotonic log entries: tried to append index %d after %d", logs[0].Index, lastIdx)
}
// Need a new buffer each time because Data is just a slice so if we re-use
// the buffer then all entries end up pointing to the same underlying data which
// contains only the final log value!
var buf bytes.Buffer
if err := w.codec.Encode(l, &buf); err != nil {
return err
}
encoded[i].Data = buf.Bytes()
encoded[i].Index = l.Index
lastIdx = l.Index
nBytes += uint64(len(encoded[i].Data))
}
if err := s.tail.Append(encoded); err != nil {
return err
}
w.metrics.IncrementCounter("log_appends", 1)
w.metrics.IncrementCounter("log_entries_written", uint64(len(encoded)))
w.metrics.IncrementCounter("log_entry_bytes_written", nBytes)
// Check if we need to roll logs
sealed, indexStart, err := s.tail.Sealed()
if err != nil {
return err
}
if sealed {
// Async rotation to allow caller to do more work while we mess with files.
w.triggerRotateLocked(indexStart)
}
return nil
}
func (w *WAL) awaitRotationLocked() {
awaitCh := w.awaitRotate
if awaitCh != nil {
// We managed to race for writeMu with the background rotate operation which
// needs to complete first. Wait for it to complete.
w.writeMu.Unlock()
<-awaitCh
w.writeMu.Lock()
}
}
// DeleteRange deletes a range of log entries. The range is inclusive.
// Implements raft.LogStore. Note that we only support deleting ranges that are
// a suffix or prefix of the log.
func (w *WAL) DeleteRange(min uint64, max uint64) error {
if err := w.checkClosed(); err != nil {
return err
}
if min > max {
// Empty inclusive range.
return nil
}
w.writeMu.Lock()
defer w.writeMu.Unlock()
// Ensure queued rotation has completed before us if we raced with it for
// write lock.
w.awaitRotationLocked()
s, release := w.acquireState()
defer release()
// Work out what type of truncation this is.
first, last := s.firstIndex(), s.lastIndex()
switch {
// |min----max|
// |first====last|
// or
// |min----max|
// |first====last|
case max < first || min > last:
// None of the range exists at all so a no-op
return nil
// |min----max|
// |first====last|
// or
// |min--------------max|
// |first====last|
// or
// |min--max|
// |first====last|
case min <= first: // max >= first implied by the first case not matching
// Note we allow head truncations where max > last which effectively removes
// the entire log.
return w.truncateHeadLocked(max + 1)
// |min----max|
// |first====last|
// or
// |min--------------max|
// |first====last|
case max >= last: // min <= last implied by first case not matching
return w.truncateTailLocked(min - 1)
// |min----max|
// |first========last|
default:
// Everything else is neither a suffix nor a prefix, so it is unsupported.
return fmt.Errorf("only suffix or prefix ranges may be deleted from log")
}
}
// Set implements raft.StableStore
func (w *WAL) Set(key []byte, val []byte) error {
if err := w.checkClosed(); err != nil {
return err
}
w.metrics.IncrementCounter("stable_sets", 1)
return w.metaDB.SetStable(key, val)
}
// Get implements raft.StableStore
func (w *WAL) Get(key []byte) ([]byte, error) {
if err := w.checkClosed(); err != nil {
return nil, err
}
w.metrics.IncrementCounter("stable_gets", 1)
return w.metaDB.GetStable(key)
}
// SetUint64 implements raft.StableStore. We assume the same key space as Set
// and Get so the caller is responsible for ensuring they don't call both Set
// and SetUint64 for the same key.
func (w *WAL) SetUint64(key []byte, val uint64) error {
var buf [8]byte
binary.LittleEndian.PutUint64(buf[:], val)
return w.Set(key, buf[:])
}
// GetUint64 implements raft.StableStore. We assume the same key space as Set
// and Get. We assume that the key was previously set with `SetUint64`; if not,
// the returned value is undefined (possibly with a nil error).
func (w *WAL) GetUint64(key []byte) (uint64, error) {
raw, err := w.Get(key)
if err != nil {
return 0, err
}
if len(raw) == 0 {
// Not set, return zero per interface contract
return 0, nil
}
// At least a tiny bit of checking is possible
if len(raw) != 8 {
return 0, fmt.Errorf("GetUint64 called on a non-uint64 key")
}
return binary.LittleEndian.Uint64(raw), nil
}
func (w *WAL) triggerRotateLocked(indexStart uint64) {
if atomic.LoadUint32(&w.closed) == 1 {
return
}
w.awaitRotate = make(chan struct{})
w.triggerRotate <- indexStart
}
func (w *WAL) runRotate() {
for {
indexStart := <-w.triggerRotate
w.writeMu.Lock()
// Either triggerRotate was closed by Close, or Close raced with a real
// trigger, either way shut down without changing anything else. In the
// second case the segment file is sealed but meta data isn't updated yet
// but we have to handle that case during recovery anyway so it's simpler
// not to try and complete the rotation here on an already-closed WAL.
closed := atomic.LoadUint32(&w.closed)
if closed == 1 {
w.writeMu.Unlock()
return
}
err := w.rotateSegmentLocked(indexStart)
if err != nil {
// The only possible errors indicate bugs and could probably validly be
// panics, but be conservative and just attempt to log them instead!
w.log.Error("rotate error", "err", err)
}
done := w.awaitRotate
w.awaitRotate = nil
w.writeMu.Unlock()
// Now we are done, close the channel to unblock the waiting writer if there
// is one
close(done)
}
}
func (w *WAL) rotateSegmentLocked(indexStart uint64) error {
txn := func(newState *state) (func(), func() error, error) {
// Mark current tail as sealed in segments
tail := newState.getTailInfo()
if tail == nil {
// Can't happen
return nil, nil, fmt.Errorf("no tail found during rotate")
}
// Note that tail is a copy since it's a value type. Even though this is a
// pointer here it's pointing to a copy on the heap that was made in
// getTailInfo above, so we can mutate it safely and update the immutable
// state with our version.
tail.SealTime = time.Now()
tail.MaxIndex = newState.tail.LastIndex()
tail.IndexStart = indexStart
w.metrics.SetGauge("last_segment_age_seconds", uint64(tail.SealTime.Sub(tail.CreateTime).Seconds()))
// Update the old tail with the seal time etc.
newState.segments = newState.segments.Set(tail.BaseIndex, *tail)
post, err := w.createNextSegment(newState)
return nil, post, err
}
w.metrics.IncrementCounter("segment_rotations", 1)
return w.mutateStateLocked(txn)
}
// createNextSegment is passed a mutable copy of the new state ready to have a
// new segment appended. newState must be a copy, taken under write lock which
// is still held by the caller and its segments map must contain all non-tail
// segments that should be in the log, all must be sealed at this point. The new
// segment's baseIndex will be the current last-segment's MaxIndex + 1 (or 1 if
// no current tail segment). The func returned is to be executed post
// transaction commit to create the actual segment file.
func (w *WAL) createNextSegment(newState *state) (func() error, error) {
// Find existing sealed tail
tail := newState.getTailInfo()
// If there is no tail, next baseIndex is 1 (or the requested next base index)
nextBaseIndex := uint64(1)
if tail != nil {
nextBaseIndex = tail.MaxIndex + 1
} else if newState.nextBaseIndex > 0 {
nextBaseIndex = newState.nextBaseIndex
}
// Create a new segment
newTail := w.newSegment(newState.nextSegmentID, nextBaseIndex)
newState.nextSegmentID++
ss := segmentState{
SegmentInfo: newTail,
}
newState.segments = newState.segments.Set(newTail.BaseIndex, ss)
// We're ready to commit now! Return a postCommit that will actually create
// the segment file once meta is persisted. We don't do it in parallel because
// we don't want to persist a file with an ID before that ID is durably stored
// in case the metaDB write doesn't happen.
post := func() error {
// Now create the new segment for writing.
sw, err := w.sf.Create(newTail)
if err != nil {
return err
}
newState.tail = sw
// Also cache the reader/log getter which is also the writer. We don't bother
// reopening read only since we assume we have exclusive access anyway and
// only use this read-only interface once the segment is sealed.
ss.r = newState.tail
// We need to re-insert it since newTail is a copy not a reference
newState.segments = newState.segments.Set(newTail.BaseIndex, ss)
return nil
}
return post, nil
}
// resetEmptyFirstSegmentBaseIndex is used to change the baseIndex of the tail
// segment file if it's empty. This is needed when the first log written has a
// different index to the base index that was assumed when the tail was created
// (e.g. on startup). It will return an error if the log is not currently empty.
func (w *WAL) resetEmptyFirstSegmentBaseIndex(newBaseIndex uint64) error {
txn := stateTxn(func(newState *state) (func(), func() error, error) {
if newState.lastIndex() > 0 {
return nil, nil, fmt.Errorf("can't reset BaseIndex on segment, log is not empty")
}
fin := func() {}
tailSeg := newState.getTailInfo()
if tailSeg != nil {
// There is an existing tail. Check if it needs to be replaced
if tailSeg.BaseIndex == newBaseIndex {
// It's fine as it is, no-op
return nil, nil, nil
}
// It needs to be removed
newState.segments = newState.segments.Delete(tailSeg.BaseIndex)
newState.tail = nil
fin = func() {
w.closeSegments([]io.Closer{tailSeg.r})
w.deleteSegments(map[uint64]uint64{tailSeg.ID: tailSeg.BaseIndex})
}
}
// Ensure the newly created tail has the right base index
newState.nextBaseIndex = newBaseIndex
// Create the new segment
post, err := w.createNextSegment(newState)
if err != nil {
return nil, nil, err
}
return fin, post, nil
})
return w.mutateStateLocked(txn)
}
func (w *WAL) truncateHeadLocked(newMin uint64) error {
txn := stateTxn(func(newState *state) (func(), func() error, error) {
oldLastIndex := newState.lastIndex()
// Iterate the segments to find any that are entirely deleted.
toDelete := make(map[uint64]uint64)
toClose := make([]io.Closer, 0, 1)
it := newState.segments.Iterator()
var head *segmentState
nTruncated := uint64(0)
for !it.Done() {
_, seg, _ := it.Next()
maxIdx := seg.MaxIndex
// If the segment is the tail (unsealed) or a sealed segment that contains
// this new min then we've found the new head.
if seg.SealTime.IsZero() {
maxIdx = newState.lastIndex()
// This is the tail, check if it actually has any content to keep
if maxIdx >= newMin {
head = &seg
break
}
} else if seg.MaxIndex >= newMin {
head = &seg
break
}
toDelete[seg.ID] = seg.BaseIndex
toClose = append(toClose, seg.r)
newState.segments = newState.segments.Delete(seg.BaseIndex)
nTruncated += (maxIdx - seg.MinIndex + 1) // +1 because MaxIndex is inclusive
}
// There may not be any segments (left) but if there are, update the new
// head's MinIndex.
var postCommit func() error
if head != nil {
// Count the entries truncated from the new head, then raise its MinIndex.
nTruncated += (newMin - head.MinIndex)
head.MinIndex = newMin
newState.segments = newState.segments.Set(head.BaseIndex, *head)
} else {
// If there is no head any more, then there is no tail either! We should
// create a new blank one ready for use when we next append like we do
// during initialization. As an optimization, we create it with a
// BaseIndex of the old MaxIndex + 1 since this is what our Raft library
// uses as the next log index after a restore so this avoids recreating
// the files a second time on the next append.
newState.nextBaseIndex = oldLastIndex + 1
pc, err := w.createNextSegment(newState)
if err != nil {
return nil, nil, err
}
postCommit = pc
}
w.metrics.IncrementCounter("head_truncations", nTruncated)
// Return a finalizer that will be called when all readers are done with the
// segments in the current state to close and delete old segments.
fin := func() {
w.closeSegments(toClose)
w.deleteSegments(toDelete)
}
return fin, postCommit, nil
})
return w.mutateStateLocked(txn)
}
func (w *WAL) truncateTailLocked(newMax uint64) error {
txn := stateTxn(func(newState *state) (func(), func() error, error) {
// Reverse iterate the segments to find any that are entirely deleted.
toDelete := make(map[uint64]uint64)
toClose := make([]io.Closer, 0, 1)
it := newState.segments.Iterator()
it.Last()
nTruncated := uint64(0)
for !it.Done() {
_, seg, _ := it.Prev()
if seg.BaseIndex <= newMax {
// We're done
break
}
maxIdx := seg.MaxIndex
if seg.SealTime.IsZero() {
maxIdx = newState.lastIndex()
}
toDelete[seg.ID] = seg.BaseIndex
toClose = append(toClose, seg.r)
newState.segments = newState.segments.Delete(seg.BaseIndex)
nTruncated += (maxIdx - seg.MinIndex + 1) // +1 because MaxIndex is inclusive
}
tail := newState.getTailInfo()
if tail != nil {
maxIdx := tail.MaxIndex
// Check that the tail is sealed (it won't be if we didn't need to remove
// the actual partial tail above).
if tail.SealTime.IsZero() {
// Actually seal it (i.e. force it to write out an index block wherever
// it got to).
indexStart, err := newState.tail.ForceSeal()
if err != nil {
return nil, nil, err
}
tail.IndexStart = indexStart
tail.SealTime = time.Now()
maxIdx = newState.lastIndex()
}
// Update the MaxIndex
nTruncated += (maxIdx - newMax)
tail.MaxIndex = newMax
// And update the tail in the new state
newState.segments = newState.segments.Set(tail.BaseIndex, *tail)
}
// Create the new tail segment
pc, err := w.createNextSegment(newState)
if err != nil {
return nil, nil, err
}
w.metrics.IncrementCounter("tail_truncations", nTruncated)
// Return a finalizer that will be called when all readers are done with the
// segments in the current state to close and delete old segments.
fin := func() {
w.closeSegments(toClose)
w.deleteSegments(toDelete)
}
return fin, pc, nil
})
return w.mutateStateLocked(txn)
}
func (w *WAL) deleteSegments(toDelete map[uint64]uint64) {
for ID, baseIndex := range toDelete {
if err := w.sf.Delete(baseIndex, ID); err != nil {
// This is not fatal. We can continue; old files might just need manual
// cleanup somehow.
w.log.Error("failed to delete old segment", "baseIndex", baseIndex, "id", ID, "err", err)
}
}
}
func (w *WAL) closeSegments(toClose []io.Closer) {
for _, c := range toClose {
if c != nil {
if err := c.Close(); err != nil {
// Shouldn't happen!
w.log.Error("error closing old segment file", "err", err)
}
}
}
}
func (w *WAL) checkClosed() error {
closed := atomic.LoadUint32(&w.closed)
if closed != 0 {
return ErrClosed
}
return nil
}
// Close closes all open files related to the WAL. The WAL is in an invalid
// state and should not be used again after this is called. It is safe (though a
// no-op) to call it multiple times and concurrent reads and writes will either
// complete safely or get ErrClosed returned depending on sequencing. Generally
// reads and writes should be stopped before calling this to avoid propagating
// errors to users during shutdown but it's safe from a data-race perspective.
func (w *WAL) Close() error {
if old := atomic.SwapUint32(&w.closed, 1); old != 0 {
// Only close once
return nil
}
// Wait for writes
w.writeMu.Lock()
defer w.writeMu.Unlock()
// It doesn't matter if there is a rotation scheduled because runRotate will
// exit when it sees we are closed anyway.
w.awaitRotate = nil
// Awake and terminate the runRotate
close(w.triggerRotate)
// Replace state with nil state
s := w.loadState()
s.acquire()
defer s.release()
w.s.Store(&state{})
// Old state might still be in use by readers; attach closers to all open
// segment files.
toClose := make([]io.Closer, 0, s.segments.Len())
it := s.segments.Iterator()
for !it.Done() {
_, seg, _ := it.Next()
if seg.r != nil {
toClose = append(toClose, seg.r)
}
}
// Store finalizer to run once all readers are done. There can't be an
// existing finalizer since this was the active state read under a write
// lock and finalizers are only set on states that have been replaced under
// that same lock.
s.finalizer.Store(func() {
w.closeSegments(toClose)
})
return w.metaDB.Close()
}
// IsMonotonic implements raft.MonotonicLogStore and informs the raft library
// that this store will only allow consecutive log indexes with no gaps.
func (w *WAL) IsMonotonic() bool {
return true
}
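Since *WAL implements raft.LogStore, raft.StableStore and raft.MonotonicLogStore, a single instance can back both the log store and the stable store of a raft node. The sketch below is a hedged usage example, not part of the vendored file: the data directory and server ID are illustrative, default options are assumed, and the FSM, snapshot store and transport are elided.
package main
import (
	"log"
	"os"

	"github.com/hashicorp/raft"
	wal "github.com/hashicorp/raft-wal"
)
func main() {
	dir := "/var/lib/deevirt/raft-wal" // illustrative path
	if err := os.MkdirAll(dir, 0o755); err != nil {
		log.Fatal(err)
	}
	// Open (or create) the write-ahead log with default options.
	store, err := wal.Open(dir)
	if err != nil {
		log.Fatal(err)
	}
	defer store.Close()
	conf := raft.DefaultConfig()
	conf.LocalID = raft.ServerID("node-1") // illustrative ID
	// The same store would be passed as both the LogStore and StableStore
	// arguments; fsm, snaps and transport are application-specific:
	//
	//   r, err := raft.NewRaft(conf, fsm, store, store, snaps, transport)
}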

27
vendor/golang.org/x/exp/LICENSE generated vendored Normal file

@ -0,0 +1,27 @@
Copyright (c) 2009 The Go Authors. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

50
vendor/golang.org/x/exp/constraints/constraints.go generated vendored Normal file

@ -0,0 +1,50 @@
// Copyright 2021 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package constraints defines a set of useful constraints to be used
// with type parameters.
package constraints
// Signed is a constraint that permits any signed integer type.
// If future releases of Go add new predeclared signed integer types,
// this constraint will be modified to include them.
type Signed interface {
~int | ~int8 | ~int16 | ~int32 | ~int64
}
// Unsigned is a constraint that permits any unsigned integer type.
// If future releases of Go add new predeclared unsigned integer types,
// this constraint will be modified to include them.
type Unsigned interface {
~uint | ~uint8 | ~uint16 | ~uint32 | ~uint64 | ~uintptr
}
// Integer is a constraint that permits any integer type.
// If future releases of Go add new predeclared integer types,
// this constraint will be modified to include them.
type Integer interface {
Signed | Unsigned
}
// Float is a constraint that permits any floating-point type.
// If future releases of Go add new predeclared floating-point types,
// this constraint will be modified to include them.
type Float interface {
~float32 | ~float64
}
// Complex is a constraint that permits any complex numeric type.
// If future releases of Go add new predeclared complex numeric types,
// this constraint will be modified to include them.
type Complex interface {
~complex64 | ~complex128
}
// Ordered is a constraint that permits any ordered type: any type
// that supports the operators < <= >= >.
// If future releases of Go add new ordered types,
// this constraint will be modified to include them.
type Ordered interface {
Integer | Float | ~string
}
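The generic helper below is a minimal, illustrative sketch (maxOf is not part of this package) showing how constraints.Ordered is typically used as a type-parameter constraint.
package orderedexample
import "golang.org/x/exp/constraints"
// maxOf returns the larger of a and b for any type that supports < <= >= >.
func maxOf[T constraints.Ordered](a, b T) T {
	if a > b {
		return a
	}
	return b
}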

23
vendor/modules.txt vendored

@ -1,6 +1,9 @@
# github.com/armon/go-metrics v0.4.1
## explicit; go 1.12
github.com/armon/go-metrics
# github.com/benbjohnson/immutable v0.4.0
## explicit; go 1.18
github.com/benbjohnson/immutable
# github.com/beorn7/perks v1.0.1
## explicit; go 1.11
github.com/beorn7/perks/quantile
@ -10,13 +13,22 @@ github.com/boltdb/bolt
# github.com/cespare/xxhash/v2 v2.3.0
## explicit; go 1.11
github.com/cespare/xxhash/v2
# github.com/coreos/etcd v3.3.27+incompatible
## explicit
github.com/coreos/etcd/pkg/fileutil
# github.com/coreos/go-semver v0.3.1
## explicit; go 1.8
github.com/coreos/go-semver/semver
# github.com/coreos/go-systemd v0.0.0-20191104093116-d3cd4ed1dbcf
## explicit
github.com/coreos/go-systemd/journal
# github.com/coreos/go-systemd/v22 v22.5.0
## explicit; go 1.12
github.com/coreos/go-systemd/v22/daemon
github.com/coreos/go-systemd/v22/journal
# github.com/coreos/pkg v0.0.0-20220810130054-c7d1c02cb6cf
## explicit
github.com/coreos/pkg/capnslog
# github.com/denisbrodbeck/machineid v1.0.1
## explicit
github.com/denisbrodbeck/machineid
@ -61,6 +73,14 @@ github.com/hashicorp/raft
# github.com/hashicorp/raft-boltdb/v2 v2.3.1
## explicit; go 1.20
github.com/hashicorp/raft-boltdb/v2
# github.com/hashicorp/raft-wal v0.4.2
## explicit; go 1.18
github.com/hashicorp/raft-wal
github.com/hashicorp/raft-wal/fs
github.com/hashicorp/raft-wal/metadb
github.com/hashicorp/raft-wal/metrics
github.com/hashicorp/raft-wal/segment
github.com/hashicorp/raft-wal/types
# github.com/json-iterator/go v1.1.12
## explicit; go 1.12
github.com/json-iterator/go
@ -156,6 +176,9 @@ go.uber.org/zap/internal/pool
go.uber.org/zap/internal/stacktrace
go.uber.org/zap/zapcore
go.uber.org/zap/zapgrpc
# golang.org/x/exp v0.0.0-20220827204233-334a2380cb91
## explicit; go 1.18
golang.org/x/exp/constraints
# golang.org/x/net v0.35.0
## explicit; go 1.18
golang.org/x/net/http/httpguts