Compare commits
2 Commits
feat/reche
...
fix/k3s-au
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
dd92147341 | ||
| 95c99cb4d5 |
@@ -1,19 +1,21 @@
|
|||||||
// Hardening: Pod Security Standards, certificate check, log rotation.
|
// Hardening: Pod Security Standards, certificate check, journald cap, storage.
|
||||||
|
|
||||||
import type { OperationContext, OperationResult, OperationGroup } from "../types.js";
|
import type { OperationContext, OperationResult, OperationGroup } from "../types.js";
|
||||||
import { runSequential } from "../utils.js";
|
import { runSequential } from "../utils.js";
|
||||||
import { applyPodSecurityStandards } from "../operations/pod-security.js";
|
import { applyPodSecurityStandards } from "../operations/pod-security.js";
|
||||||
import { checkCertExpiry } from "../operations/cert-check.js";
|
import { checkCertExpiry } from "../operations/cert-check.js";
|
||||||
import { configureLogRotation } from "../operations/log-rotation.js";
|
import { configureLogRotation } from "../operations/log-rotation.js";
|
||||||
|
import { configureJournaldLimits } from "../operations/journald-limits.js";
|
||||||
import { configureLonghornDisk } from "../operations/longhorn-disk.js";
|
import { configureLonghornDisk } from "../operations/longhorn-disk.js";
|
||||||
|
|
||||||
export const hardeningGroup: OperationGroup = {
|
export const hardeningGroup: OperationGroup = {
|
||||||
name: "hardening",
|
name: "hardening",
|
||||||
description: "Pod security, certificate check, log rotation, storage",
|
description: "Pod security, certificate check, journald cap, storage",
|
||||||
operations: [
|
operations: [
|
||||||
{ name: "Apply Pod Security Standards", fn: applyPodSecurityStandards },
|
{ name: "Apply Pod Security Standards", fn: applyPodSecurityStandards },
|
||||||
{ name: "Check certificate expiry", fn: checkCertExpiry },
|
{ name: "Check certificate expiry", fn: checkCertExpiry },
|
||||||
{ name: "Configure log rotation", fn: configureLogRotation },
|
{ name: "Decommission file-based audit logs", fn: configureLogRotation },
|
||||||
|
{ name: "Configure journald disk cap", fn: configureJournaldLimits },
|
||||||
{ name: "Configure Longhorn disk", fn: configureLonghornDisk },
|
{ name: "Configure Longhorn disk", fn: configureLonghornDisk },
|
||||||
],
|
],
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -76,7 +76,6 @@ sed -i 's/^SELINUX=enforcing/SELINUX=permissive/' /etc/selinux/config 2>/dev/nul
|
|||||||
# ── 5b. Create k3s config directory ──
|
# ── 5b. Create k3s config directory ──
|
||||||
echo "[5/10] Writing k3s server configuration..."
|
echo "[5/10] Writing k3s server configuration..."
|
||||||
mkdir -p /etc/rancher/k3s
|
mkdir -p /etc/rancher/k3s
|
||||||
mkdir -p /var/log/kubernetes
|
|
||||||
|
|
||||||
cat > /etc/rancher/k3s/config.yaml << 'K3S_CONFIG'
|
cat > /etc/rancher/k3s/config.yaml << 'K3S_CONFIG'
|
||||||
# k3s server configuration — CIS hardened
|
# k3s server configuration — CIS hardened
|
||||||
@@ -91,13 +90,10 @@ disable:
|
|||||||
- servicelb
|
- servicelb
|
||||||
- traefik
|
- traefik
|
||||||
|
|
||||||
# API server hardening
|
# API server hardening (audit-log-path=- routes audit to journald via stdout)
|
||||||
kube-apiserver-arg:
|
kube-apiserver-arg:
|
||||||
- "anonymous-auth=false"
|
- "anonymous-auth=false"
|
||||||
- "audit-log-path=/var/log/kubernetes/audit.log"
|
- "audit-log-path=-"
|
||||||
- "audit-log-maxage=30"
|
|
||||||
- "audit-log-maxbackup=10"
|
|
||||||
- "audit-log-maxsize=100"
|
|
||||||
- "audit-policy-file=/etc/rancher/k3s/audit-policy.yaml"
|
- "audit-policy-file=/etc/rancher/k3s/audit-policy.yaml"
|
||||||
- "enable-admission-plugins=NodeRestriction,PodSecurity"
|
- "enable-admission-plugins=NodeRestriction,PodSecurity"
|
||||||
- "request-timeout=300s"
|
- "request-timeout=300s"
|
||||||
|
|||||||
194
bastion/src/modules/modules/k3s/src/operations/etcd-recover.ts
Normal file
194
bastion/src/modules/modules/k3s/src/operations/etcd-recover.ts
Normal file
@@ -0,0 +1,194 @@
|
|||||||
|
// Recover a broken etcd member by removing it from the cluster, wiping its
|
||||||
|
// local state, and restarting k3s so it rejoins as a fresh member.
|
||||||
|
//
|
||||||
|
// Use case: a node panics on startup with
|
||||||
|
// "tocommit(N+1) is out of range [lastIndex(N)]. Was the raft log corrupted,
|
||||||
|
// truncated, or lost?"
|
||||||
|
// This means the local raft WAL is missing the last entry the leader thinks
|
||||||
|
// the follower acknowledged (lost write, unclean shutdown, etc). The fix is
|
||||||
|
// always the same and well-documented; this codifies it so we don't fumble
|
||||||
|
// the procedure under pressure.
|
||||||
|
//
|
||||||
|
// Preconditions:
|
||||||
|
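// - At least one healthy peer is reachable so the cluster has quorum after
|
||||||
|
// we remove the broken member. (For a 3-node cluster: 2 healthy. For a
|
||||||
|
// 5-node: 3 healthy.) Only the member count is checked: the function
|
||||||
|
// refuses when fewer than 3 members are registered. It does not probe
|
||||||
|
// peer health, so confirm the remaining peers are healthy before running.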
|
||||||
|
// - SSH access to both the broken node and a healthy peer.
|
||||||
|
// - etcdctl available on the healthy peer (k3s does not bundle it; the
|
||||||
|
// procedure installs it on demand on Fedora).
|
||||||
|
|
||||||
|
import type { SshClient } from "../types.js";
|
||||||
|
|
||||||
|
/**
 * Paths of the CA certificate, client certificate and client key that are
 * handed to etcdctl via --cacert / --cert / --key.
 */
const ETCD_TLS = {
  ca: "/var/lib/rancher/k3s/server/tls/etcd/server-ca.crt",
  cert: "/var/lib/rancher/k3s/server/tls/etcd/server-client.crt",
  key: "/var/lib/rancher/k3s/server/tls/etcd/server-client.key",
} as const;

/** Timeout in milliseconds (one minute) passed as `timeoutMs` to SSH commands. */
const SSH_TIMEOUT = 60 * 1_000;
|
||||||
|
|
||||||
|
/** Inputs for {@link recoverEtcdMember}. */
export interface RecoverEtcdMemberOptions {
  /**
   * SSH connection to the node whose etcd member is broken. k3s is stopped
   * and restarted over this connection and the local etcd state is archived.
   */
  broken: SshClient;
  /**
   * SSH connection to another server node of the same cluster. All etcdctl
   * commands (member list / member remove) run here.
   */
  peer: SshClient;
  /**
   * Kubernetes node name of the broken node; used to locate its entry in the
   * etcd member list.
   */
  brokenHostname: string;
  /** Optional progress callback; when omitted, progress messages are dropped. */
  log?: (msg: string) => void;
}
|
||||||
|
|
||||||
|
/** Outcome reported by {@link recoverEtcdMember}. */
export interface RecoverEtcdMemberResult {
  /** True only when the node was seen back in the member list under a new id. */
  success: boolean;
  /** Whether the procedure reports having modified the cluster or the node. */
  changed: boolean;
  /** Human-readable summary of what happened. */
  message: string;
  /** Id the cluster assigned to the node after it rejoined, when it was observed. */
  newMemberId?: string;
  /** Id of the etcd member that was targeted for removal. */
  removedMemberId?: string;
  /** Failure detail (command output or exception message), when available. */
  error?: string;
}
|
||||||
|
|
||||||
|
/**
 * Builds the shell command line for an etcdctl (v3 API) invocation against
 * the local endpoint https://127.0.0.1:2379, authenticated with the
 * certificates in ETCD_TLS and limited by a 10 s command timeout.
 *
 * @param subcmd etcdctl sub-command and arguments, appended verbatim
 *   (no shell quoting is applied).
 * @returns the complete command string, ready to pass to an SSH exec call.
 */
function etcdctl(subcmd: string): string {
  const flags = [
    `--cacert=${ETCD_TLS.ca}`,
    `--cert=${ETCD_TLS.cert}`,
    `--key=${ETCD_TLS.key}`,
    "--endpoints=https://127.0.0.1:2379",
    "--command-timeout=10s",
  ];
  return `ETCDCTL_API=3 etcdctl ${flags.join(" ")} ${subcmd}`;
}
|
||||||
|
|
||||||
|
/**
 * Makes a best-effort attempt to have etcdctl available on the peer.
 *
 * Probes with `command -v etcdctl`; when the probe does not report a path,
 * runs `dnf install -y etcd`. The install result is deliberately not
 * inspected: on a host without dnf (or when the install fails) the next
 * etcdctl invocation fails and surfaces the problem to the caller.
 *
 * @param peer SSH client of the node on which etcdctl will be run.
 */
async function ensureEtcdctl(peer: SshClient): Promise<void> {
  const { exitCode, stdout } = await peer.exec("command -v etcdctl 2>/dev/null", {
    timeoutMs: 5_000,
  });
  const alreadyInstalled = exitCode === 0 && stdout.trim() !== "";
  if (!alreadyInstalled) {
    await peer.exec("dnf install -y etcd 2>&1", { timeoutMs: 120_000 });
  }
}
|
||||||
|
|
||||||
|
/**
 * Runs `etcdctl member list` on the peer and parses its comma-separated output.
 *
 * Each non-empty line is expected to look like
 *   <hex-id>, started, <name>, <peer-urls>, <client-urls>, <isLearner>
 * Only the first field (id) and third field (name) are kept; lines without an
 * id are dropped, and a missing name becomes the empty string.
 *
 * @param peer SSH client of the node on which etcdctl is run.
 * @returns one `{ id, name }` entry per parsed member line.
 * @throws Error when etcdctl exits non-zero; the message carries stderr, or
 *   stdout when stderr is empty.
 */
async function getMemberList(peer: SshClient): Promise<Array<{ id: string; name: string }>> {
  const listing = await peer.exec(etcdctl("member list"), { timeoutMs: SSH_TIMEOUT });
  if (listing.exitCode !== 0) {
    throw new Error(`etcdctl member list failed: ${listing.stderr || listing.stdout}`);
  }

  const members: Array<{ id: string; name: string }> = [];
  for (const rawLine of listing.stdout.split("\n")) {
    const line = rawLine.trim();
    if (!line) continue;
    const fields = line.split(",").map((field) => field.trim());
    const id = fields[0] ?? "";
    const name = fields[2] ?? "";
    if (id) members.push({ id, name });
  }
  return members;
}
|
||||||
|
|
||||||
|
/**
 * Decides whether an etcd member name belongs to the given k3s node.
 *
 * Member names have the form `<hostname>-<random-suffix>`, so the name is
 * compared with its final `-suffix` segment stripped (a name identical to the
 * hostname is accepted as well). A bare prefix test is not sufficient:
 * `host-1` is a prefix of `host-10-abc123`, and an empty hostname is a prefix
 * of every name — either would select the wrong member for removal.
 */
function isMemberOfHost(memberName: string, hostname: string): boolean {
  if (hostname === "") return false;
  if (memberName === hostname) return true;
  const suffixStart = memberName.lastIndexOf("-");
  return suffixStart > 0 && memberName.slice(0, suffixStart) === hostname;
}

/**
 * Recovers a broken etcd member: stops k3s on the broken node, removes its
 * member from the cluster via a peer, archives the local etcd database,
 * deletes the node's server TLS/cred directories, restarts k3s, and waits for
 * the node to reappear in the member list under a new id.
 *
 * Nothing is touched (and `changed` is false) when the member list has fewer
 * than 3 entries, when no member matches `brokenHostname`, or when more than
 * one member matches (ambiguous target).
 *
 * @param opts SSH clients for the broken node and a peer, the broken node's
 *   hostname, and an optional progress logger.
 * @returns a result object; this function does not throw. Exceptions raised
 *   by the SSH/etcdctl steps are caught and reported as a failed result whose
 *   `changed` flag reflects how far the procedure got.
 */
export async function recoverEtcdMember(
  opts: RecoverEtcdMemberOptions,
): Promise<RecoverEtcdMemberResult> {
  const log = opts.log ?? (() => {});

  // Progress markers so that an exception late in the procedure does not
  // report an untouched system.
  let changed = false;
  let removedMemberId: string | undefined;

  try {
    log(`Looking up etcd member id for ${opts.brokenHostname} via peer...`);
    await ensureEtcdctl(opts.peer);

    const members = await getMemberList(opts.peer);
    if (members.length < 3) {
      return {
        success: false,
        changed: false,
        message: "Refusing to remove a member from a cluster with <3 members (quorum would be lost)",
        error: `member count = ${members.length}`,
      };
    }

    // Member names are <hostname>-<random-suffix>; match the hostname exactly
    // rather than by prefix so a sibling node can never be selected.
    const [broken, ...otherMatches] = members.filter((m) =>
      isMemberOfHost(m.name, opts.brokenHostname),
    );
    if (!broken) {
      return {
        success: false,
        changed: false,
        message: `No etcd member found matching hostname ${opts.brokenHostname}`,
        error: `members: ${members.map((m) => m.name).join(", ")}`,
      };
    }
    if (otherMatches.length > 0) {
      // Removal is destructive; do not guess which entry is the broken one.
      return {
        success: false,
        changed: false,
        message: `Multiple etcd members match hostname ${opts.brokenHostname}; refusing to guess which one to remove`,
        error: `matches: ${[broken, ...otherMatches].map((m) => `${m.id} (${m.name})`).join(", ")}`,
      };
    }
    log(`Broken member: ${broken.id} (${broken.name})`);

    log("Step 1/4: stopping k3s on broken node");
    await opts.broken.exec("systemctl stop k3s 2>&1", { timeoutMs: SSH_TIMEOUT });
    changed = true; // the node's service state has been modified from here on

    log("Step 2/4: removing broken etcd member from cluster");
    const remove = await opts.peer.exec(
      etcdctl(`member remove ${broken.id}`),
      { timeoutMs: SSH_TIMEOUT },
    );
    if (remove.exitCode !== 0) {
      return {
        success: false,
        changed,
        message: "etcdctl member remove failed",
        error: remove.stderr || remove.stdout,
        removedMemberId: broken.id,
      };
    }
    removedMemberId = broken.id;

    log("Step 3/4: archiving corrupt etcd state and stale TLS/cred dirs on broken node");
    const ts = Math.floor(Date.now() / 1000);
    await opts.broken.exec(
      [
        // `|| true` keeps the chain going when there is no db directory to archive.
        `mv /var/lib/rancher/k3s/server/db /var/lib/rancher/k3s/server/db.corrupt-${ts} 2>/dev/null || true`,
        "rm -rf /var/lib/rancher/k3s/server/tls /var/lib/rancher/k3s/server/cred",
      ].join(" && "),
      { timeoutMs: SSH_TIMEOUT },
    );

    log("Step 4/4: starting k3s on broken node — it will rejoin");
    await opts.broken.exec("systemctl start k3s 2>&1", { timeoutMs: SSH_TIMEOUT });

    // Poll for rejoin (60 attempts, 5 s apart). The new member id is whatever
    // the cluster assigns on join, so look for the hostname under a different id.
    let newMemberId: string | undefined;
    for (let i = 0; i < 60; i++) {
      await new Promise((r) => setTimeout(r, 5_000));
      try {
        const after = await getMemberList(opts.peer);
        const rejoined = after.find(
          (m) => isMemberOfHost(m.name, opts.brokenHostname) && m.id !== broken.id,
        );
        if (rejoined) {
          newMemberId = rejoined.id;
          break;
        }
      } catch {
        // peer may briefly be unreachable mid-rejoin — keep polling
      }
    }

    if (!newMemberId) {
      return {
        success: false,
        changed: true,
        message: "k3s started but new member did not appear in cluster within 5 minutes",
        removedMemberId: broken.id,
      };
    }

    log(`Rejoined as ${newMemberId}`);
    return {
      success: true,
      changed: true,
      message: `Recovered: removed ${broken.id}, rejoined as ${newMemberId}`,
      removedMemberId: broken.id,
      newMemberId,
    };
  } catch (err) {
    return {
      success: false,
      // Report what was actually done before the failure, not a blanket "false".
      changed,
      message: "Recovery failed",
      error: err instanceof Error ? err.message : String(err),
      ...(removedMemberId !== undefined ? { removedMemberId } : {}),
    };
  }
}
|
||||||
@@ -11,7 +11,13 @@ export { installK3sBinary } from "./k3s-install.js";
|
|||||||
export { installCilium } from "./cilium.js";
|
export { installCilium } from "./cilium.js";
|
||||||
export { fixCoreDnsUpstream } from "./dns-fix.js";
|
export { fixCoreDnsUpstream } from "./dns-fix.js";
|
||||||
export { configureLogRotation } from "./log-rotation.js";
|
export { configureLogRotation } from "./log-rotation.js";
|
||||||
|
export { configureJournaldLimits } from "./journald-limits.js";
|
||||||
export { applyDefaultNetworkPolicies } from "./network-policy.js";
|
export { applyDefaultNetworkPolicies } from "./network-policy.js";
|
||||||
export { applyPodSecurityStandards } from "./pod-security.js";
|
export { applyPodSecurityStandards } from "./pod-security.js";
|
||||||
export { checkCertExpiry } from "./cert-check.js";
|
export { checkCertExpiry } from "./cert-check.js";
|
||||||
export { configureLonghornDisk } from "./longhorn-disk.js";
|
export { configureLonghornDisk } from "./longhorn-disk.js";
|
||||||
|
export { recoverEtcdMember } from "./etcd-recover.js";
|
||||||
|
export type {
|
||||||
|
RecoverEtcdMemberOptions,
|
||||||
|
RecoverEtcdMemberResult,
|
||||||
|
} from "./etcd-recover.js";
|
||||||
|
|||||||
@@ -0,0 +1,33 @@
|
|||||||
|
// Cap journald disk usage so audit logs (which now flow through journald via
|
||||||
|
// kube-apiserver's stdout) cannot fill /var/log. Default journald uses up to
|
||||||
|
// 10% of the filesystem, capped at 4 GB. In a /var/log of ~10 GB shared with
|
||||||
|
// other services, that's still room for audit volume to evict useful logs.
|
||||||
|
// 2 GB / 200 MB-per-file is a comfortable middle.
|
||||||
|
|
||||||
|
import type { Operation, OperationResult } from "../types.js";
|
||||||
|
import { sshOpts, writeRemoteFile } from "../utils.js";
|
||||||
|
|
||||||
|
// journald drop-in body: 2G total cap, keep 1G free, 200M per journal file.
// The trailing empty element yields the final newline.
const DROPIN_CONTENT = [
  "[Journal]",
  "SystemMaxUse=2G",
  "SystemKeepFree=1G",
  "SystemMaxFileSize=200M",
  "",
].join("\n");

// Location of the drop-in written by configureJournaldLimits.
const DROPIN_PATH = "/etc/systemd/journald.conf.d/10-k3s-audit-cap.conf";
|
||||||
|
|
||||||
|
/**
 * Writes the journald size-cap drop-in to DROPIN_PATH and, only when the file
 * content changed, signals and restarts systemd-journald so the new limits
 * apply without a reboot.
 *
 * The restart is best-effort (`|| true`), so the operation reports success
 * regardless of the restart outcome; `changed` mirrors what writeRemoteFile
 * returned.
 */
export const configureJournaldLimits: Operation = async (ctx): Promise<OperationResult> => {
  const changed = await writeRemoteFile(ctx, DROPIN_PATH, DROPIN_CONTENT);

  if (!changed) {
    return {
      success: true,
      changed,
      message: "journald limits already configured",
    };
  }

  // SIGUSR2 first, then a restart so journald re-reads its configuration.
  const applyLimits = [
    "systemctl kill --signal=SIGUSR2 systemd-journald 2>/dev/null; ",
    "systemctl restart systemd-journald 2>&1 || true",
  ].join("");
  await ctx.ssh.exec(applyLimits, sshOpts(ctx));

  return {
    success: true,
    changed,
    message: "journald limits configured (2 GB cap)",
  };
};
|
||||||
@@ -13,6 +13,12 @@ function generateServerConfig(config: K3sConfig): string {
|
|||||||
const clusterLines = isJoining
|
const clusterLines = isJoining
|
||||||
? `server: "${config.k3sServerUrl}"\ntoken: "${config.k3sToken}"`
|
? `server: "${config.k3sServerUrl}"\ntoken: "${config.k3sToken}"`
|
||||||
: "cluster-init: true";
|
: "cluster-init: true";
|
||||||
|
// audit-log-path=- routes audit events to k3s.service's stdout, which systemd
|
||||||
|
// forwards to journald. journald enforces its own size caps (see
|
||||||
|
// configureJournaldLimits) so audit volume cannot fill the disk. File-based
|
||||||
|
// audit logs led to /var/log/kubernetes growing to 7+ GB because apiserver's
|
||||||
|
// own rotation produced files that any logrotate glob would double-rotate
|
||||||
|
// and never expire.
|
||||||
return `# k3s server configuration — CIS hardened, etcd HA
|
return `# k3s server configuration — CIS hardened, etcd HA
|
||||||
${clusterLines}
|
${clusterLines}
|
||||||
protect-kernel-defaults: true
|
protect-kernel-defaults: true
|
||||||
@@ -30,10 +36,7 @@ node-label:
|
|||||||
|
|
||||||
kube-apiserver-arg:
|
kube-apiserver-arg:
|
||||||
- "anonymous-auth=false"
|
- "anonymous-auth=false"
|
||||||
- "audit-log-path=/var/log/kubernetes/audit.log"
|
- "audit-log-path=-"
|
||||||
- "audit-log-maxage=30"
|
|
||||||
- "audit-log-maxbackup=10"
|
|
||||||
- "audit-log-maxsize=100"
|
|
||||||
- "audit-policy-file=/etc/rancher/k3s/audit-policy.yaml"
|
- "audit-policy-file=/etc/rancher/k3s/audit-policy.yaml"
|
||||||
- "enable-admission-plugins=NodeRestriction,PodSecurity"
|
- "enable-admission-plugins=NodeRestriction,PodSecurity"
|
||||||
- "request-timeout=300s"
|
- "request-timeout=300s"
|
||||||
@@ -61,7 +64,7 @@ kubelet-arg:
|
|||||||
}
|
}
|
||||||
|
|
||||||
export const writeK3sConfig: Operation = async (ctx): Promise<OperationResult> => {
|
export const writeK3sConfig: Operation = async (ctx): Promise<OperationResult> => {
|
||||||
await ctx.ssh.exec("mkdir -p /etc/rancher/k3s /var/log/kubernetes", sshOpts(ctx));
|
await ctx.ssh.exec("mkdir -p /etc/rancher/k3s", sshOpts(ctx));
|
||||||
|
|
||||||
const content = isServerRole(ctx.config.role)
|
const content = isServerRole(ctx.config.role)
|
||||||
? generateServerConfig(ctx.config)
|
? generateServerConfig(ctx.config)
|
||||||
|
|||||||
@@ -1,25 +1,44 @@
|
|||||||
// Configure log rotation for k3s.
|
// Decommission file-based k8s audit logging in favor of journald.
|
||||||
|
//
|
||||||
|
// Earlier versions wrote audit events to /var/log/kubernetes/audit.log and
|
||||||
|
// rotated them with a logrotate rule. Two failure modes followed: kube-apiserver
|
||||||
|
// rotated internally (audit-{ts}.log), the *.log glob in logrotate
|
||||||
|
// double-rotated those (-{date}), and the resulting filename matched no
|
||||||
|
// retention policy, so the directory grew unbounded (we observed 7+ GB).
|
||||||
|
//
|
||||||
|
// k3s now sets audit-log-path=- so audit goes to stdout → journald, which
|
||||||
|
// enforces SystemMaxUse caps. This operation removes the obsolete logrotate
|
||||||
|
// rule and reaps any audit files left behind by the old setup. Idempotent: on
|
||||||
|
// fresh installs everything is already absent and the operation is a no-op.
|
||||||
|
|
||||||
import type { Operation, OperationResult } from "../types.js";
|
import type { Operation, OperationResult } from "../types.js";
|
||||||
import { writeRemoteFile } from "../utils.js";
|
import { sshOpts } from "../utils.js";
|
||||||
|
|
||||||
const LOGROTATE_CONFIG = `/var/log/kubernetes/*.log {
|
const REMOVE_LOGROTATE = "rm -f /etc/logrotate.d/k3s";
|
||||||
daily
|
|
||||||
rotate 14
|
// Bounded by a max-depth and explicit name pattern so we never reach outside
|
||||||
compress
|
// the deprecated audit-log directory.
|
||||||
delaycompress
|
const REAP_OLD_AUDIT_FILES =
|
||||||
missingok
|
"find /var/log/kubernetes -maxdepth 1 -type f " +
|
||||||
notifempty
|
"\\( -name 'audit*.log*' -o -name 'audit-*.log' \\) " +
|
||||||
copytruncate
|
"-delete 2>/dev/null; " +
|
||||||
maxsize 100M
|
"rmdir /var/log/kubernetes 2>/dev/null; true";
|
||||||
}`;
|
|
||||||
|
|
||||||
export const configureLogRotation: Operation = async (ctx): Promise<OperationResult> => {
|
export const configureLogRotation: Operation = async (ctx): Promise<OperationResult> => {
|
||||||
const changed = await writeRemoteFile(ctx, "/etc/logrotate.d/k3s", LOGROTATE_CONFIG);
|
const before = await ctx.ssh.exec(
|
||||||
|
"test -e /etc/logrotate.d/k3s -o -d /var/log/kubernetes && echo present || echo absent",
|
||||||
|
sshOpts(ctx),
|
||||||
|
);
|
||||||
|
const wasPresent = before.stdout.trim() === "present";
|
||||||
|
|
||||||
|
await ctx.ssh.exec(REMOVE_LOGROTATE, sshOpts(ctx));
|
||||||
|
await ctx.ssh.exec(REAP_OLD_AUDIT_FILES, sshOpts(ctx));
|
||||||
|
|
||||||
return {
|
return {
|
||||||
success: true,
|
success: true,
|
||||||
changed,
|
changed: wasPresent,
|
||||||
message: changed ? "Log rotation configured" : "Log rotation already configured",
|
message: wasPresent
|
||||||
|
? "Removed legacy file-based audit logging (now via journald)"
|
||||||
|
: "No legacy audit log artifacts present",
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -71,9 +71,14 @@ describe("k3s install script — server role", () => {
|
|||||||
expect(script).toContain("enable-admission-plugins=NodeRestriction,PodSecurity");
|
expect(script).toContain("enable-admission-plugins=NodeRestriction,PodSecurity");
|
||||||
});
|
});
|
||||||
|
|
||||||
it("configures audit logging", () => {
|
it("configures audit logging via journald (stdout)", () => {
|
||||||
expect(script).toContain("audit-log-path=/var/log/kubernetes/audit.log");
|
expect(script).toContain("audit-log-path=-");
|
||||||
expect(script).toContain("audit-log-maxage=30");
|
// file-based fields and the now-obsolete log directory must be gone
|
||||||
|
expect(script).not.toContain("/var/log/kubernetes/audit.log");
|
||||||
|
expect(script).not.toContain("audit-log-maxage");
|
||||||
|
expect(script).not.toContain("audit-log-maxbackup");
|
||||||
|
expect(script).not.toContain("audit-log-maxsize");
|
||||||
|
expect(script).not.toContain("mkdir -p /var/log/kubernetes");
|
||||||
});
|
});
|
||||||
|
|
||||||
it("cleans stale flannel vxlan before Cilium install", () => {
|
it("cleans stale flannel vxlan before Cilium install", () => {
|
||||||
|
|||||||
@@ -348,3 +348,143 @@ describe("applyPodSecurityStandards", () => {
|
|||||||
expectCommand(ctx.ssh, "pod-security.kubernetes.io/audit=restricted");
|
expectCommand(ctx.ssh, "pod-security.kubernetes.io/audit=restricted");
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// --- Audit Logging Decommission (file-based → journald) ---
|
||||||
|
|
||||||
|
import { configureLogRotation } from "../src/operations/log-rotation.js";
|
||||||
|
import { configureJournaldLimits } from "../src/operations/journald-limits.js";
|
||||||
|
|
||||||
|
describe("configureLogRotation (decommission file-based audit logs)", () => {
  // Builds a context whose first SSH call (the presence probe) prints
  // `probeOutput`; every later call resolves to OK.
  const ctxWithProbe = (probeOutput: string) => {
    const ctx = mockCtx();
    ctx.ssh.exec.mockResolvedValueOnce(stdout(probeOutput));
    ctx.ssh.exec.mockResolvedValue(OK);
    return ctx;
  };

  it("removes the legacy logrotate rule and reaps obsolete audit files", async () => {
    const ctx = ctxWithProbe("present"); // probe: legacy artifacts exist

    const { success, changed } = await configureLogRotation(ctx);

    expect(success).toBe(true);
    expect(changed).toBe(true);
    expectCommand(ctx.ssh, "rm -f /etc/logrotate.d/k3s");
    expectCommand(ctx.ssh, /find \/var\/log\/kubernetes.*audit.*-delete/);
    expectCommand(ctx.ssh, "rmdir /var/log/kubernetes");
  });

  it("is a no-op when nothing legacy is present", async () => {
    const ctx = ctxWithProbe("absent");

    const { success, changed } = await configureLogRotation(ctx);

    expect(success).toBe(true);
    expect(changed).toBe(false);
  });
});
|
||||||
|
|
||||||
|
describe("configureJournaldLimits", () => {
  // Builds a context whose first SSH call (reading the existing drop-in)
  // prints `currentFile`; every later call resolves to OK.
  const ctxWithExistingFile = (currentFile: string) => {
    const ctx = mockCtx();
    ctx.ssh.exec.mockResolvedValueOnce(stdout(currentFile));
    ctx.ssh.exec.mockResolvedValue(OK);
    return ctx;
  };

  it("writes a 2 GB SystemMaxUse drop-in and reloads journald when changed", async () => {
    const ctx = ctxWithExistingFile("__LABCTL_NOT_FOUND__"); // no existing drop-in

    const { success, changed } = await configureJournaldLimits(ctx);

    expect(success).toBe(true);
    expect(changed).toBe(true);

    // Locate the command that writes the drop-in and inspect what it carries.
    const commands = ctx.ssh.exec.mock.calls.map((call) => call[0] as string);
    const written = commands.find(
      (cmd) => cmd.includes("10-k3s-audit-cap.conf") && cmd.includes("LABCTL_EOF"),
    );
    expect(written).toBeTruthy();
    expect(written).toContain("SystemMaxUse=2G");
    expect(written).toContain("SystemKeepFree=1G");
    expectCommand(ctx.ssh, "systemctl restart systemd-journald");
  });

  it("does not restart journald when the drop-in is already correct", async () => {
    const upToDate = ["[Journal]", "SystemMaxUse=2G", "SystemKeepFree=1G", "SystemMaxFileSize=200M", ""].join("\n");
    const ctx = ctxWithExistingFile(upToDate);

    const { success, changed } = await configureJournaldLimits(ctx);

    expect(success).toBe(true);
    expect(changed).toBe(false);
    expectNoCommand(ctx.ssh, "systemctl restart systemd-journald");
  });
});
|
||||||
|
|
||||||
|
// --- Etcd Recovery ---
|
||||||
|
|
||||||
|
import { recoverEtcdMember } from "../src/operations/etcd-recover.js";
|
||||||
|
import { mockSsh } from "./helpers.js";
|
||||||
|
|
||||||
|
describe("recoverEtcdMember", () => {
  // One line of `etcdctl member list` output for a started, non-learner member.
  const memberRow = (id: string, name: string, ip: string): string =>
    `${id}, started, ${name}, https://${ip}:2380, https://${ip}:2379, false`;

  const HOST_A = memberRow("111", "host-a-aaa", "10.0.0.1");
  const HOST_B = memberRow("222", "host-b-bbb", "10.0.0.2");
  const HOST_C = memberRow("333", "host-c-ccc", "10.0.0.3");
  const HOST_B_REJOINED = memberRow("444", "host-b-zzz", "10.0.0.2");

  // Creates the two SSH mocks; the peer already answers the etcdctl probe
  // ("present") and the first member-list call with the given rows.
  const setup = (...initialMembers: string[]) => {
    const broken = mockSsh();
    const peer = mockSsh();
    peer.exec.mockResolvedValueOnce(stdout("/usr/bin/etcdctl")); // etcdctl present
    peer.exec.mockResolvedValueOnce(stdout(initialMembers.join("\n")));
    return { broken, peer };
  };

  // Critically: the broken node must NOT have been stopped.
  const expectK3sNeverStopped = (broken: ReturnType<typeof mockSsh>): void => {
    expect(broken.exec).not.toHaveBeenCalledWith(
      expect.stringContaining("systemctl stop k3s"),
      expect.anything(),
    );
  };

  it("refuses to operate when cluster is below 3 members (quorum risk)", async () => {
    const { broken, peer } = setup(HOST_A, HOST_B);

    const result = await recoverEtcdMember({ broken, peer, brokenHostname: "host-b" });

    expect(result.success).toBe(false);
    expect(result.message).toMatch(/quorum/i);
    expectK3sNeverStopped(broken);
  });

  it("performs full procedure when quorum is preserved", async () => {
    // member list (3 members, target = host-b)
    const { broken, peer } = setup(HOST_A, HOST_B, HOST_C);
    // member remove
    peer.exec.mockResolvedValueOnce(stdout("Member 222 removed"));
    // post-rejoin member list — new id 444 for host-b
    peer.exec.mockResolvedValueOnce(stdout([HOST_A, HOST_C, HOST_B_REJOINED].join("\n")));

    const result = await recoverEtcdMember({ broken, peer, brokenHostname: "host-b" });

    expect(result.success).toBe(true);
    expect(result.removedMemberId).toBe("222");
    expect(result.newMemberId).toBe("444");
    expectCommand(broken, "systemctl stop k3s");
    expectCommand(peer, "member remove 222");
    expectCommand(broken, /db\.corrupt-/);
    expectCommand(broken, /rm -rf .*\/server\/tls/);
    expectCommand(broken, "systemctl start k3s");
  });

  it("fails clearly when no member matches the broken hostname", async () => {
    const { broken, peer } = setup(HOST_A, HOST_B, HOST_C);

    const result = await recoverEtcdMember({ broken, peer, brokenHostname: "host-d" });

    expect(result.success).toBe(false);
    expect(result.message).toMatch(/No etcd member found/);
    expectK3sNeverStopped(broken);
  });
});
|
||||||
|
|||||||
Reference in New Issue
Block a user