Merge branch 'main' into feat/v2-phase1-foundation

This commit is contained in:
Michal
2026-05-05 22:06:34 +01:00
9 changed files with 430 additions and 32 deletions

View File

@@ -1,19 +1,21 @@
// Hardening: Pod Security Standards, certificate check, log rotation.
// Hardening: Pod Security Standards, certificate check, journald cap, storage.
import type { OperationContext, OperationResult, OperationGroup } from "../types.js";
import { runSequential } from "../utils.js";
import { applyPodSecurityStandards } from "../operations/pod-security.js";
import { checkCertExpiry } from "../operations/cert-check.js";
import { configureLogRotation } from "../operations/log-rotation.js";
import { configureJournaldLimits } from "../operations/journald-limits.js";
import { configureLonghornDisk } from "../operations/longhorn-disk.js";
export const hardeningGroup: OperationGroup = {
name: "hardening",
description: "Pod security, certificate check, log rotation, storage",
description: "Pod security, certificate check, journald cap, storage",
operations: [
{ name: "Apply Pod Security Standards", fn: applyPodSecurityStandards },
{ name: "Check certificate expiry", fn: checkCertExpiry },
{ name: "Configure log rotation", fn: configureLogRotation },
{ name: "Decommission file-based audit logs", fn: configureLogRotation },
{ name: "Configure journald disk cap", fn: configureJournaldLimits },
{ name: "Configure Longhorn disk", fn: configureLonghornDisk },
],
};

View File

@@ -76,7 +76,6 @@ sed -i 's/^SELINUX=enforcing/SELINUX=permissive/' /etc/selinux/config 2>/dev/nul
# ── 5b. Create k3s config directory ──
echo "[5/10] Writing k3s server configuration..."
mkdir -p /etc/rancher/k3s
mkdir -p /var/log/kubernetes
cat > /etc/rancher/k3s/config.yaml << 'K3S_CONFIG'
# k3s server configuration — CIS hardened
@@ -91,13 +90,10 @@ disable:
- servicelb
- traefik
# API server hardening
# API server hardening (audit-log-path=- routes audit to journald via stdout)
kube-apiserver-arg:
- "anonymous-auth=false"
- "audit-log-path=/var/log/kubernetes/audit.log"
- "audit-log-maxage=30"
- "audit-log-maxbackup=10"
- "audit-log-maxsize=100"
- "audit-log-path=-"
- "audit-policy-file=/etc/rancher/k3s/audit-policy.yaml"
- "enable-admission-plugins=NodeRestriction,PodSecurity"
- "request-timeout=300s"

View File

@@ -0,0 +1,194 @@
// Recover a broken etcd member by removing it from the cluster, wiping its
// local state, and restarting k3s so it rejoins as a fresh member.
//
// Use case: a node panics on startup with
// "tocommit(N+1) is out of range [lastIndex(N)]. Was the raft log corrupted,
// truncated, or lost?"
// This means the local raft WAL is missing the last entry the leader thinks
// the follower acknowledged (lost write, unclean shutdown, etc). The fix is
// always the same and well-documented; this codifies it so we don't fumble
// the procedure under pressure.
//
// Preconditions:
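// - Enough healthy peers remain that the cluster keeps quorum after we
// remove the broken member. (For a 3-node cluster: 2 healthy. For a
// 5-node: 3 healthy.) The function only enforces a coarse guard: it
// refuses when the member list has fewer than 3 entries. It does not
// probe peer health, so confirm the remaining peers are healthy first.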
// - SSH access to both the broken node and a healthy peer.
// - etcdctl available on the healthy peer (k3s does not bundle it; the
// procedure installs it on demand on Fedora).
import type { SshClient } from "../types.js";
// Directory holding the TLS material that this module hands to etcdctl.
const ETCD_TLS_DIR = "/var/lib/rancher/k3s/server/tls/etcd";

/** Certificate paths used for etcdctl's --cacert / --cert / --key flags. */
const ETCD_TLS = {
  ca: `${ETCD_TLS_DIR}/server-ca.crt`,
  cert: `${ETCD_TLS_DIR}/server-client.crt`,
  key: `${ETCD_TLS_DIR}/server-client.key`,
} as const;

/** Timeout, in milliseconds, passed as `timeoutMs` to the remote commands in this module. */
const SSH_TIMEOUT = 60 * 1_000;
/** Inputs for the etcd member recovery procedure (`recoverEtcdMember`). */
export interface RecoverEtcdMemberOptions {
/**
 * SSH client for the broken node (the one panicking). The procedure uses it
 * to stop k3s, move the etcd data directory aside, delete the server TLS and
 * cred directories, and start k3s again.
 */
broken: SshClient;
/**
 * SSH client for any healthy server peer in the same cluster. Every etcdctl
 * command (member list, member remove) is executed through this client.
 */
peer: SshClient;
/**
 * Hostname (k8s node name) of the broken node. Used to find its etcd member
 * id by comparing against the names in the etcd member list.
 */
brokenHostname: string;
/** Logger for progress output. Optional; when omitted, progress messages are discarded. */
log?: (msg: string) => void;
}
/**
 * Outcome reported by `recoverEtcdMember`. Failures, including exceptions
 * raised while running remote commands, are returned in this shape rather
 * than thrown.
 */
export interface RecoverEtcdMemberResult {
/** True only when the broken node was seen back in the member list under a new id. */
success: boolean;
/** Whether the procedure reports having modified node or cluster state. */
changed: boolean;
/** Human-readable summary of the outcome. */
message: string;
/** New etcd member id assigned after rejoin (when known). */
newMemberId?: string;
/** Old etcd member id that was removed. */
removedMemberId?: string;
/** Failure detail (command output or exception message) when one is available. */
error?: string;
}
/**
 * Builds the shell command line for a single etcdctl (v3 API) invocation
 * against the local etcd endpoint, authenticated with the server's client
 * certificate.
 *
 * @param subcmd - The etcdctl subcommand and its arguments, appended verbatim.
 * @returns The complete command line as one space-separated string.
 */
function etcdctl(subcmd: string): string {
  const flags = [
    `--cacert=${ETCD_TLS.ca}`,
    `--cert=${ETCD_TLS.cert}`,
    `--key=${ETCD_TLS.key}`,
    "--endpoints=https://127.0.0.1:2379",
    "--command-timeout=10s",
  ];
  return `ETCDCTL_API=3 etcdctl ${flags.join(" ")} ${subcmd}`;
}
/**
 * Makes a best-effort attempt to have etcdctl available on the peer.
 *
 * If `command -v etcdctl` already resolves, nothing is done. Otherwise a dnf
 * install is attempted and its outcome is deliberately not inspected: on a
 * host without dnf the failure surfaces through the caller's next etcdctl
 * invocation instead.
 *
 * @param peer - SSH client for the node on which etcdctl will be run.
 */
async function ensureEtcdctl(peer: SshClient): Promise<void> {
  const { exitCode, stdout } = await peer.exec("command -v etcdctl 2>/dev/null", {
    timeoutMs: 5_000,
  });
  const alreadyInstalled = exitCode === 0 && stdout.trim() !== "";
  if (!alreadyInstalled) {
    await peer.exec("dnf install -y etcd 2>&1", { timeoutMs: 120_000 });
  }
}
/**
 * Runs `etcdctl member list` on the peer and parses its comma-separated
 * output into id/name pairs.
 *
 * Expected line format:
 *   <hex-id>, started, <name>, <peer-urls>, <client-urls>, <isLearner>
 *
 * Blank lines and lines without an id are skipped; a missing name field is
 * reported as the empty string.
 *
 * @param peer - SSH client for a node that can reach etcd.
 * @returns One entry per parsed member line, in output order.
 * @throws Error when etcdctl exits non-zero; the message carries stderr, or
 *   stdout when stderr is empty.
 */
async function getMemberList(peer: SshClient): Promise<Array<{ id: string; name: string }>> {
  const { exitCode, stdout, stderr } = await peer.exec(etcdctl("member list"), {
    timeoutMs: SSH_TIMEOUT,
  });
  if (exitCode !== 0) {
    throw new Error(`etcdctl member list failed: ${stderr || stdout}`);
  }

  const members: Array<{ id: string; name: string }> = [];
  for (const rawLine of stdout.split("\n")) {
    const line = rawLine.trim();
    if (!line) continue;
    const fields = line.split(",").map((field) => field.trim());
    const id = fields[0] ?? "";
    if (!id) continue;
    members.push({ id, name: fields[2] ?? "" });
  }
  return members;
}
/**
 * Reports whether an etcd member name belongs to the given k8s node.
 *
 * Member names have the form `<hostname>-<random-suffix>`. A bare prefix test
 * is not safe here: "node-1" is a prefix of "node-10-ab12cd34", and the empty
 * string is a prefix of every name. Either case could select a healthy member
 * for removal. A name matches only when it equals the hostname, or is the
 * hostname followed by "-" and one non-empty suffix segment.
 *
 * NOTE(review): assumes the generated suffix never contains "-"; confirm
 * against the k3s member naming scheme.
 */
function memberBelongsToHost(memberName: string, hostname: string): boolean {
  if (hostname === "") return false;
  if (memberName === hostname) return true;
  const prefix = `${hostname}-`;
  if (!memberName.startsWith(prefix)) return false;
  const suffix = memberName.slice(prefix.length);
  return suffix !== "" && !suffix.includes("-");
}

/**
 * Recovers a broken etcd member: removes it from the cluster via a healthy
 * peer, moves its local etcd data aside, and restarts k3s so it rejoins as a
 * fresh member.
 *
 * The function refuses, before touching anything, when
 *   - the member list has fewer than 3 entries,
 *   - no member matches `brokenHostname`, or
 *   - more than one member matches (it will not guess which to remove).
 *
 * @param opts - SSH clients for the broken node and a healthy peer, the broken
 *   node's hostname, and an optional progress logger.
 * @returns A result object. Failures, including exceptions from remote
 *   commands, are reported through it rather than thrown. `changed` is true
 *   as soon as k3s has been stopped on the broken node, so a failure after
 *   that point is not reported as a no-op.
 */
export async function recoverEtcdMember(
  opts: RecoverEtcdMemberOptions,
): Promise<RecoverEtcdMemberResult> {
  const log = opts.log ?? (() => {});

  // Tracked outside the happy path so the catch-all below can report what
  // had already been done when an unexpected error interrupted the procedure.
  let changed = false;
  let removedMemberId: string | undefined;

  try {
    log(`Looking up etcd member id for ${opts.brokenHostname} via peer...`);
    await ensureEtcdctl(opts.peer);
    const members = await getMemberList(opts.peer);

    if (members.length < 3) {
      return {
        success: false,
        changed: false,
        message: "Refusing to remove a member from a cluster with <3 members (quorum would be lost)",
        error: `member count = ${members.length}`,
      };
    }

    const [broken, ...otherMatches] = members.filter((m) =>
      memberBelongsToHost(m.name, opts.brokenHostname),
    );
    if (!broken) {
      return {
        success: false,
        changed: false,
        message: `No etcd member found matching hostname ${opts.brokenHostname}`,
        error: `members: ${members.map((m) => m.name).join(", ")}`,
      };
    }
    if (otherMatches.length > 0) {
      // Removing the wrong member can cost quorum; let the operator decide.
      return {
        success: false,
        changed: false,
        message: `Multiple etcd members match hostname ${opts.brokenHostname}; refusing to guess which one to remove`,
        error: `matches: ${[broken, ...otherMatches].map((m) => `${m.id} (${m.name})`).join(", ")}`,
      };
    }
    const brokenId = broken.id;
    log(`Broken member: ${brokenId} (${broken.name})`);

    log("Step 1/4: stopping k3s on broken node");
    await opts.broken.exec("systemctl stop k3s 2>&1", { timeoutMs: SSH_TIMEOUT });
    changed = true;

    log("Step 2/4: removing broken etcd member from cluster");
    const remove = await opts.peer.exec(
      etcdctl(`member remove ${brokenId}`),
      { timeoutMs: SSH_TIMEOUT },
    );
    if (remove.exitCode !== 0) {
      return {
        success: false,
        changed,
        message: "etcdctl member remove failed",
        error: remove.stderr || remove.stdout,
        // Id of the member whose removal was attempted.
        removedMemberId: brokenId,
      };
    }
    removedMemberId = brokenId;

    log("Step 3/4: archiving corrupt etcd state and stale TLS/cred dirs on broken node");
    const ts = Math.floor(Date.now() / 1000);
    await opts.broken.exec(
      [
        `mv /var/lib/rancher/k3s/server/db /var/lib/rancher/k3s/server/db.corrupt-${ts} 2>/dev/null || true`,
        "rm -rf /var/lib/rancher/k3s/server/tls /var/lib/rancher/k3s/server/cred",
      ].join(" && "),
      { timeoutMs: SSH_TIMEOUT },
    );

    log("Step 4/4: starting k3s on broken node — it will rejoin");
    await opts.broken.exec("systemctl start k3s 2>&1", { timeoutMs: SSH_TIMEOUT });

    // Poll for rejoin. The new member id is whatever the cluster assigns on
    // join. The first check runs immediately; up to 60 further checks follow
    // at 5 s intervals, which is the 5 minutes quoted in the timeout message.
    const pollIntervalMs = 5_000;
    const maxWaits = 60;
    let newMemberId: string | undefined;
    for (let attempt = 0; attempt <= maxWaits && !newMemberId; attempt++) {
      if (attempt > 0) {
        await new Promise((resolve) => setTimeout(resolve, pollIntervalMs));
      }
      try {
        const after = await getMemberList(opts.peer);
        newMemberId = after.find(
          (m) => m.id !== brokenId && memberBelongsToHost(m.name, opts.brokenHostname),
        )?.id;
      } catch {
        // peer may briefly be unreachable mid-rejoin — keep polling
      }
    }

    if (!newMemberId) {
      return {
        success: false,
        changed: true,
        message: "k3s started but new member did not appear in cluster within 5 minutes",
        removedMemberId: brokenId,
      };
    }

    log(`Rejoined as ${newMemberId}`);
    return {
      success: true,
      changed: true,
      message: `Recovered: removed ${brokenId}, rejoined as ${newMemberId}`,
      removedMemberId: brokenId,
      newMemberId,
    };
  } catch (err) {
    return {
      success: false,
      changed,
      message: "Recovery failed",
      error: err instanceof Error ? err.message : String(err),
      ...(removedMemberId !== undefined ? { removedMemberId } : {}),
    };
  }
}

View File

@@ -11,7 +11,13 @@ export { installK3sBinary } from "./k3s-install.js";
export { installCilium } from "./cilium.js";
export { fixCoreDnsUpstream } from "./dns-fix.js";
export { configureLogRotation } from "./log-rotation.js";
export { configureJournaldLimits } from "./journald-limits.js";
export { applyDefaultNetworkPolicies } from "./network-policy.js";
export { applyPodSecurityStandards } from "./pod-security.js";
export { checkCertExpiry } from "./cert-check.js";
export { configureLonghornDisk } from "./longhorn-disk.js";
export { recoverEtcdMember } from "./etcd-recover.js";
export type {
RecoverEtcdMemberOptions,
RecoverEtcdMemberResult,
} from "./etcd-recover.js";

View File

@@ -0,0 +1,33 @@
// Cap journald disk usage so audit logs (which now flow through journald via
// kube-apiserver's stdout) cannot fill /var/log. Default journald uses up to
// 10% of the filesystem, capped at 4 GB. In a /var/log of ~10 GB shared with
// other services, that's still room for audit volume to evict useful logs.
// 2 GB / 200 MB-per-file is a comfortable middle.
import type { Operation, OperationResult } from "../types.js";
import { sshOpts, writeRemoteFile } from "../utils.js";
/** Location of the journald drop-in written by this operation. */
const DROPIN_PATH = "/etc/systemd/journald.conf.d/10-k3s-audit-cap.conf";

/** Drop-in body: a [Journal] section with the three size limits, newline-terminated. */
const DROPIN_CONTENT = [
  "[Journal]",
  "SystemMaxUse=2G",
  "SystemKeepFree=1G",
  "SystemMaxFileSize=200M",
  "", // trailing newline
].join("\n");
/**
 * Writes the journald size-cap drop-in and, only when the file content
 * actually changed, signals and restarts systemd-journald so the new limits
 * take effect without a reboot.
 *
 * The restart is best-effort (`|| true`); the operation reports success
 * regardless of the restart's exit status.
 */
export const configureJournaldLimits: Operation = async (ctx): Promise<OperationResult> => {
  const changed = await writeRemoteFile(ctx, DROPIN_PATH, DROPIN_CONTENT);

  if (!changed) {
    return {
      success: true,
      changed,
      message: "journald limits already configured",
    };
  }

  // Send SIGUSR2 first (errors hidden), then restart journald.
  const applyCommand = [
    "systemctl kill --signal=SIGUSR2 systemd-journald 2>/dev/null",
    "systemctl restart systemd-journald 2>&1 || true",
  ].join("; ");
  await ctx.ssh.exec(applyCommand, sshOpts(ctx));

  return {
    success: true,
    changed,
    message: "journald limits configured (2 GB cap)",
  };
};

View File

@@ -13,6 +13,12 @@ function generateServerConfig(config: K3sConfig): string {
const clusterLines = isJoining
? `server: "${config.k3sServerUrl}"\ntoken: "${config.k3sToken}"`
: "cluster-init: true";
// audit-log-path=- routes audit events to k3s.service's stdout, which systemd
// forwards to journald. journald enforces its own size caps (see
// configureJournaldLimits) so audit volume cannot fill the disk. File-based
// audit logs led to /var/log/kubernetes growing to 7+ GB because apiserver's
// own rotation produced files that any logrotate glob would double-rotate
// and never expire.
return `# k3s server configuration — CIS hardened, etcd HA
${clusterLines}
protect-kernel-defaults: true
@@ -30,10 +36,7 @@ node-label:
kube-apiserver-arg:
- "anonymous-auth=false"
- "audit-log-path=/var/log/kubernetes/audit.log"
- "audit-log-maxage=30"
- "audit-log-maxbackup=10"
- "audit-log-maxsize=100"
- "audit-log-path=-"
- "audit-policy-file=/etc/rancher/k3s/audit-policy.yaml"
- "enable-admission-plugins=NodeRestriction,PodSecurity"
- "request-timeout=300s"
@@ -61,7 +64,7 @@ kubelet-arg:
}
export const writeK3sConfig: Operation = async (ctx): Promise<OperationResult> => {
await ctx.ssh.exec("mkdir -p /etc/rancher/k3s /var/log/kubernetes", sshOpts(ctx));
await ctx.ssh.exec("mkdir -p /etc/rancher/k3s", sshOpts(ctx));
const content = isServerRole(ctx.config.role)
? generateServerConfig(ctx.config)

View File

@@ -1,25 +1,44 @@
// Configure log rotation for k3s.
// Decommission file-based k8s audit logging in favor of journald.
//
// Earlier versions wrote audit events to /var/log/kubernetes/audit.log and
// rotated them with a logrotate rule. Two failure modes followed: kube-apiserver
// rotated internally (audit-{ts}.log), the *.log glob in logrotate
// double-rotated those (-{date}), and the resulting filename matched no
// retention policy, so the directory grew unbounded (we observed 7+ GB).
//
// k3s now sets audit-log-path=- so audit goes to stdout → journald, which
// enforces SystemMaxUse caps. This operation removes the obsolete logrotate
// rule and reaps any audit files left behind by the old setup. Idempotent: on
// fresh installs everything is already absent and the operation is a no-op.
import type { Operation, OperationResult } from "../types.js";
import { writeRemoteFile } from "../utils.js";
import { sshOpts } from "../utils.js";
const LOGROTATE_CONFIG = `/var/log/kubernetes/*.log {
daily
rotate 14
compress
delaycompress
missingok
notifempty
copytruncate
maxsize 100M
}`;
// Deletes the logrotate rule left by the earlier file-based audit setup;
// `-f` makes this a no-op when the file is already absent.
const REMOVE_LOGROTATE = "rm -f /etc/logrotate.d/k3s";

// Bounded by a max-depth and explicit name pattern so we never reach outside
// the deprecated audit-log directory. One glob is enough: 'audit*.log*'
// already matches every name that 'audit-*.log' would, so the former
// two-pattern `-o` group was redundant. The trailing rmdir only succeeds
// when the directory ends up empty; `true` keeps the exit status at 0.
const REAP_OLD_AUDIT_FILES =
  "find /var/log/kubernetes -maxdepth 1 -type f -name 'audit*.log*' " +
  "-delete 2>/dev/null; " +
  "rmdir /var/log/kubernetes 2>/dev/null; true";
export const configureLogRotation: Operation = async (ctx): Promise<OperationResult> => {
const changed = await writeRemoteFile(ctx, "/etc/logrotate.d/k3s", LOGROTATE_CONFIG);
const before = await ctx.ssh.exec(
"test -e /etc/logrotate.d/k3s -o -d /var/log/kubernetes && echo present || echo absent",
sshOpts(ctx),
);
const wasPresent = before.stdout.trim() === "present";
await ctx.ssh.exec(REMOVE_LOGROTATE, sshOpts(ctx));
await ctx.ssh.exec(REAP_OLD_AUDIT_FILES, sshOpts(ctx));
return {
success: true,
changed,
message: changed ? "Log rotation configured" : "Log rotation already configured",
changed: wasPresent,
message: wasPresent
? "Removed legacy file-based audit logging (now via journald)"
: "No legacy audit log artifacts present",
};
};

View File

@@ -71,9 +71,14 @@ describe("k3s install script — server role", () => {
expect(script).toContain("enable-admission-plugins=NodeRestriction,PodSecurity");
});
it("configures audit logging", () => {
expect(script).toContain("audit-log-path=/var/log/kubernetes/audit.log");
expect(script).toContain("audit-log-maxage=30");
it("configures audit logging via journald (stdout)", () => {
expect(script).toContain("audit-log-path=-");
// file-based fields and the now-obsolete log directory must be gone
expect(script).not.toContain("/var/log/kubernetes/audit.log");
expect(script).not.toContain("audit-log-maxage");
expect(script).not.toContain("audit-log-maxbackup");
expect(script).not.toContain("audit-log-maxsize");
expect(script).not.toContain("mkdir -p /var/log/kubernetes");
});
it("cleans stale flannel vxlan before Cilium install", () => {

View File

@@ -348,3 +348,143 @@ describe("applyPodSecurityStandards", () => {
expectCommand(ctx.ssh, "pod-security.kubernetes.io/audit=restricted");
});
});
// --- Audit Logging Decommission (file-based → journald) ---
import { configureLogRotation } from "../src/operations/log-rotation.js";
import { configureJournaldLimits } from "../src/operations/journald-limits.js";
// Covers configureLogRotation in its decommissioning role: it probes for
// legacy artifacts, removes the logrotate rule, and reaps old audit files.
// The operation's first exec call is the presence probe, so the first mocked
// response decides whether the result reports `changed`.
describe("configureLogRotation (decommission file-based audit logs)", () => {
it("removes the legacy logrotate rule and reaps obsolete audit files", async () => {
const ctx = mockCtx();
ctx.ssh.exec.mockResolvedValueOnce(stdout("present")); // probe: legacy artifacts exist
// Every later exec call (rule removal, file reaping) resolves to OK.
ctx.ssh.exec.mockResolvedValue(OK);
const result = await configureLogRotation(ctx);
expect(result.success).toBe(true);
expect(result.changed).toBe(true);
// The cleanup commands are issued regardless of the probe result; this
// test pins their text.
expectCommand(ctx.ssh, "rm -f /etc/logrotate.d/k3s");
expectCommand(ctx.ssh, /find \/var\/log\/kubernetes.*audit.*-delete/);
expectCommand(ctx.ssh, "rmdir /var/log/kubernetes");
});
it("is a no-op when nothing legacy is present", async () => {
const ctx = mockCtx();
// Probe answers "absent", so the operation must report changed=false.
ctx.ssh.exec.mockResolvedValueOnce(stdout("absent"));
ctx.ssh.exec.mockResolvedValue(OK);
const result = await configureLogRotation(ctx);
expect(result.success).toBe(true);
expect(result.changed).toBe(false);
});
});
// Covers configureJournaldLimits: the drop-in is written through
// writeRemoteFile, and journald is restarted only when that helper reports a
// change.
describe("configureJournaldLimits", () => {
it("writes a 2 GB SystemMaxUse drop-in and reloads journald when changed", async () => {
const ctx = mockCtx();
// NOTE(review): the sentinel is presumably what writeRemoteFile's read-back
// yields for a missing file — confirm against utils.
ctx.ssh.exec.mockResolvedValueOnce(stdout("__LABCTL_NOT_FOUND__")); // no existing drop-in
ctx.ssh.exec.mockResolvedValue(OK);
const result = await configureJournaldLimits(ctx);
expect(result.success).toBe(true);
expect(result.changed).toBe(true);
// Locate the exec call that wrote the drop-in.
// NOTE(review): LABCTL_EOF looks like writeRemoteFile's heredoc delimiter — confirm.
const writeCall = ctx.ssh.exec.mock.calls.find((c) => {
const cmd = c[0] as string;
return cmd.includes("10-k3s-audit-cap.conf") && cmd.includes("LABCTL_EOF");
});
expect(writeCall).toBeTruthy();
const written = writeCall?.[0] as string;
expect(written).toContain("SystemMaxUse=2G");
expect(written).toContain("SystemKeepFree=1G");
expectCommand(ctx.ssh, "systemctl restart systemd-journald");
});
it("does not restart journald when the drop-in is already correct", async () => {
const ctx = mockCtx();
// Remote content identical to the drop-in the operation would write.
const existing =
"[Journal]\nSystemMaxUse=2G\nSystemKeepFree=1G\nSystemMaxFileSize=200M\n";
ctx.ssh.exec.mockResolvedValueOnce(stdout(existing));
ctx.ssh.exec.mockResolvedValue(OK);
const result = await configureJournaldLimits(ctx);
expect(result.success).toBe(true);
expect(result.changed).toBe(false);
expectNoCommand(ctx.ssh, "systemctl restart systemd-journald");
});
});
// --- Etcd Recovery ---
import { recoverEtcdMember } from "../src/operations/etcd-recover.js";
import { mockSsh } from "./helpers.js";
// Covers recoverEtcdMember. Mocked peer responses are consumed in call order:
// (1) the `command -v etcdctl` probe, (2) the initial member list,
// (3) `member remove`, (4) the member list fetched by the rejoin poll.
describe("recoverEtcdMember", () => {
it("refuses to operate when cluster is below 3 members (quorum risk)", async () => {
const broken = mockSsh();
const peer = mockSsh();
peer.exec.mockResolvedValueOnce(stdout("/usr/bin/etcdctl")); // etcdctl present
// Only two members listed, so the <3 guard must trigger.
peer.exec.mockResolvedValueOnce(stdout(
"111, started, host-a-aaa, https://10.0.0.1:2380, https://10.0.0.1:2379, false\n" +
"222, started, host-b-bbb, https://10.0.0.2:2380, https://10.0.0.2:2379, false",
));
const result = await recoverEtcdMember({ broken, peer, brokenHostname: "host-b" });
expect(result.success).toBe(false);
expect(result.message).toMatch(/quorum/i);
// Critically: must NOT have stopped k3s or removed anything
expect(broken.exec).not.toHaveBeenCalledWith(expect.stringContaining("systemctl stop k3s"), expect.anything());
});
// NOTE(review): recoverEtcdMember waits 5 s on a real timer before its first
// rejoin poll and no fake timers are installed here, so this test appears to
// take at least 5 s of wall time — confirm the runner's per-test timeout
// allows for that.
it("performs full procedure when quorum is preserved", async () => {
const broken = mockSsh();
const peer = mockSsh();
// ensureEtcdctl: present
peer.exec.mockResolvedValueOnce(stdout("/usr/bin/etcdctl"));
// member list (3 members, target = host-b)
peer.exec.mockResolvedValueOnce(stdout(
"111, started, host-a-aaa, https://10.0.0.1:2380, https://10.0.0.1:2379, false\n" +
"222, started, host-b-bbb, https://10.0.0.2:2380, https://10.0.0.2:2379, false\n" +
"333, started, host-c-ccc, https://10.0.0.3:2380, https://10.0.0.3:2379, false",
));
// member remove
peer.exec.mockResolvedValueOnce(stdout("Member 222 removed"));
// post-rejoin member list — new id 444 for host-b
peer.exec.mockResolvedValueOnce(stdout(
"111, started, host-a-aaa, https://10.0.0.1:2380, https://10.0.0.1:2379, false\n" +
"333, started, host-c-ccc, https://10.0.0.3:2380, https://10.0.0.3:2379, false\n" +
"444, started, host-b-zzz, https://10.0.0.2:2380, https://10.0.0.2:2379, false",
));
const result = await recoverEtcdMember({ broken, peer, brokenHostname: "host-b" });
expect(result.success).toBe(true);
expect(result.removedMemberId).toBe("222");
expect(result.newMemberId).toBe("444");
// Each of the four procedure steps must have issued its command.
expectCommand(broken,"systemctl stop k3s");
expectCommand(peer,"member remove 222");
expectCommand(broken,/db\.corrupt-/);
expectCommand(broken,/rm -rf .*\/server\/tls/);
expectCommand(broken,"systemctl start k3s");
});
it("fails clearly when no member matches the broken hostname", async () => {
const broken = mockSsh();
const peer = mockSsh();
peer.exec.mockResolvedValueOnce(stdout("/usr/bin/etcdctl"));
// Three members, none belonging to host-d.
peer.exec.mockResolvedValueOnce(stdout(
"111, started, host-a-aaa, https://10.0.0.1:2380, https://10.0.0.1:2379, false\n" +
"222, started, host-b-bbb, https://10.0.0.2:2380, https://10.0.0.2:2379, false\n" +
"333, started, host-c-ccc, https://10.0.0.3:2380, https://10.0.0.3:2379, false",
));
const result = await recoverEtcdMember({ broken, peer, brokenHostname: "host-d" });
expect(result.success).toBe(false);
expect(result.message).toMatch(/No etcd member found/);
// Nothing destructive may run when the target cannot be identified.
expect(broken.exec).not.toHaveBeenCalledWith(expect.stringContaining("systemctl stop k3s"), expect.anything());
});
});