diff --git a/DAARION-INFRASTRUCTURE-STACK.md b/DAARION-INFRASTRUCTURE-STACK.md new file mode 100644 index 00000000..0b7e13f5 --- /dev/null +++ b/DAARION-INFRASTRUCTURE-STACK.md @@ -0,0 +1,993 @@ +# πŸ—οΈ DAARION Infrastructure Stack β€” Π”Π΅Ρ†Π΅Π½Ρ‚Ρ€Π°Π»Ρ–Π·ΠΎΠ²Π°Π½Π° ΠΌΠ΅Ρ€Π΅ΠΆΠ° + +**ВСрсія:** 1.0.0 +**Π”Π°Ρ‚Π°:** 2026-01-10 +**Бтатус:** Π’ процСсі впровадТСння + +--- + +## 🎯 ΠšΠΎΠ½Ρ†Π΅ΠΏΡ†Ρ–Ρ + +**Π”Π΅Ρ†Π΅Π½Ρ‚Ρ€Π°Π»Ρ–Π·ΠΎΠ²Π°Π½Π° ΠΌΠ΅Ρ€Π΅ΠΆΠ° власних Π΄Π°Ρ‚Π°Ρ†Π΅Π½Ρ‚Ρ€Ρ–Π² Ρ‚Π° Π½ΠΎΠ΄**, Ρ€ΠΎΠ·ΠΏΠΎΠ΄Ρ–Π»Π΅Π½ΠΈΡ… Π³Π΅ΠΎΠ³Ρ€Π°Ρ„Ρ–Ρ‡Π½ΠΎ: +- Π‘Π΅Π· залСТності Π²Ρ–Π΄ ΠΎΠ΄Π½ΠΎΠ³ΠΎ cloud-ΠΏΡ€ΠΎΠ²Π°ΠΉΠ΄Π΅Ρ€Π° +- Π“Ρ–Π±Ρ€ΠΈΠ΄Π½Π° інфраструктура (bare-metal + VM + K8s) +- Multi-DC Π°Ρ€Ρ…Ρ–Ρ‚Π΅ΠΊΡ‚ΡƒΡ€Π° Π· Consul для service discovery + +--- + +## πŸ“¦ Technology Stack + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ INFRASTRUCTURE LAYER β”‚ +β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ +β”‚ Terraform β”‚ Infrastructure as Code β”‚ +β”‚ (networks, VPC, β”‚ - ΠœΠ΅Ρ€Π΅ΠΆΡ–, VPC, firewall rules β”‚ +β”‚ LB, DNS, storage) β”‚ - Load Balancers, DNS records β”‚ +β”‚ β”‚ - Storage provisioning β”‚ +β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ +β”‚ CONFIGURATION LAYER β”‚ +β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ +β”‚ Ansible β”‚ Configuration Management β”‚ +β”‚ (OS bootstrap, β”‚ - SSH keys, users, packages β”‚ +β”‚ hardening, k3s) β”‚ - 
Security hardening β”‚ +β”‚ β”‚ - K3s/K8s cluster bootstrap β”‚ +β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ +β”‚ SECRETS LAYER β”‚ +β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ +β”‚ HashiCorp Vault β”‚ Centralized Secrets Management β”‚ +β”‚ + External Secrets β”‚ - Database credentials β”‚ +β”‚ Operator β”‚ - API keys, certificates β”‚ +β”‚ β”‚ - Dynamic secrets rotation β”‚ +β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ +β”‚ ORCHESTRATION LAYER β”‚ +β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ +β”‚ K3s / Kubernetes β”‚ Container Orchestration β”‚ +β”‚ + CoreDNS β”‚ - Lightweight K8s (k3s for edge) β”‚ +β”‚ β”‚ - Service discovery via CoreDNS β”‚ +β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ +β”‚ SERVICE DISCOVERY (Multi-DC) β”‚ +β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ +β”‚ Consul β”‚ Multi-DC Service Discovery β”‚ +β”‚ (for hybrid/ β”‚ - Cross-datacenter discovery β”‚ +β”‚ multi-DC) β”‚ - Health checking β”‚ +β”‚ β”‚ - Service mesh (optional) β”‚ 
+β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ +β”‚ OBSERVABILITY LAYER β”‚ +β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ +β”‚ Prometheus β”‚ Metrics collection & alerting β”‚ +β”‚ Grafana β”‚ Dashboards & visualization β”‚ +β”‚ Loki β”‚ Log aggregation β”‚ +β”‚ Tempo β”‚ Distributed tracing β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +--- + +## 🌍 ΠŸΠΎΡ‚ΠΎΡ‡Π½Π° ΠΌΠ΅Ρ€Π΅ΠΆΠ° + +| Node | Location | Type | Role | Status | +|------|----------|------|------|--------| +| **NODE1** | Hetzner DE | Dedicated | Master, Gateway | βœ… Active | +| **NODE2** | Local (Ivan) | MacBook M4 | Dev, Testing | βœ… Active | +| **NODE3** | Remote DC | Threadripper+RTX3090 | AI/ML, GPU | βœ… Active | +| **NODE4+** | TBD | Various | Compute | πŸ”œ Planned | + +--- + +## πŸ“ Repository Structure + +``` +infrastructure/ +β”œβ”€β”€ terraform/ +β”‚ β”œβ”€β”€ modules/ +β”‚ β”‚ β”œβ”€β”€ network/ # VPC, subnets, firewall +β”‚ β”‚ β”œβ”€β”€ compute/ # VMs, bare-metal provisioning +β”‚ β”‚ β”œβ”€β”€ dns/ # DNS records +β”‚ β”‚ β”œβ”€β”€ storage/ # Volumes, NFS, S3-compatible +β”‚ β”‚ └── load-balancer/ # HAProxy, Traefik configs +β”‚ β”œβ”€β”€ environments/ +β”‚ β”‚ β”œβ”€β”€ production/ +β”‚ β”‚ β”œβ”€β”€ staging/ +β”‚ β”‚ └── development/ +β”‚ └── main.tf +β”‚ +β”œβ”€β”€ ansible/ +β”‚ β”œβ”€β”€ inventory/ +β”‚ β”‚ β”œβ”€β”€ production.yml +β”‚ β”‚ β”œβ”€β”€ staging.yml +β”‚ β”‚ └── group_vars/ +β”‚ β”‚ β”œβ”€β”€ all.yml +β”‚ β”‚ β”œβ”€β”€ masters.yml +β”‚ β”‚ β”œβ”€β”€ workers.yml +β”‚ β”‚ └── gpu_nodes.yml +β”‚ 
β”œβ”€β”€ playbooks/ +β”‚ β”‚ β”œβ”€β”€ bootstrap.yml # OS setup, SSH, packages +β”‚ β”‚ β”œβ”€β”€ hardening.yml # Security hardening +β”‚ β”‚ β”œβ”€β”€ k3s-install.yml # K3s cluster setup +β”‚ β”‚ β”œβ”€β”€ vault-setup.yml # Vault installation +β”‚ β”‚ β”œβ”€β”€ observability.yml # Prometheus/Grafana/Loki +β”‚ β”‚ └── consul-setup.yml # Consul for multi-DC +β”‚ β”œβ”€β”€ roles/ +β”‚ β”‚ β”œβ”€β”€ common/ +β”‚ β”‚ β”œβ”€β”€ security/ +β”‚ β”‚ β”œβ”€β”€ docker/ +β”‚ β”‚ β”œβ”€β”€ k3s/ +β”‚ β”‚ β”œβ”€β”€ vault/ +β”‚ β”‚ β”œβ”€β”€ consul/ +β”‚ β”‚ └── observability/ +β”‚ └── ansible.cfg +β”‚ +β”œβ”€β”€ kubernetes/ +β”‚ β”œβ”€β”€ base/ +β”‚ β”‚ β”œβ”€β”€ namespaces/ +β”‚ β”‚ β”œβ”€β”€ rbac/ +β”‚ β”‚ └── network-policies/ +β”‚ β”œβ”€β”€ apps/ +β”‚ β”‚ β”œβ”€β”€ daarion-core/ +β”‚ β”‚ β”œβ”€β”€ postgres/ +β”‚ β”‚ β”œβ”€β”€ redis/ +β”‚ β”‚ └── monitoring/ +β”‚ β”œβ”€β”€ external-secrets/ +β”‚ β”‚ └── vault-backend.yml +β”‚ └── kustomization.yaml +β”‚ +β”œβ”€β”€ vault/ +β”‚ β”œβ”€β”€ policies/ +β”‚ β”œβ”€β”€ secrets-engines/ +β”‚ └── auth-methods/ +β”‚ +β”œβ”€β”€ consul/ +β”‚ β”œβ”€β”€ config/ +β”‚ └── services/ +β”‚ +└── observability/ + β”œβ”€β”€ prometheus/ + β”œβ”€β”€ grafana/ + β”œβ”€β”€ loki/ + └── tempo/ +``` + +--- + +## πŸš€ Phase 1: Π‘Π°Π·ΠΎΠ²Π° інфраструктура + +ΠŸΠΎΡ‡Π½Π΅ΠΌΠΎ Π· встановлСння Π±Π°Π·ΠΎΠ²ΠΎΠ³ΠΎ стСку Π½Π° NODE1 Ρ‚Π° NODE3. 
+ +### 1.1 Ansible Inventory + +```yaml +# ansible/inventory/production.yml +all: + vars: + ansible_python_interpreter: /usr/bin/python3 + timezone: "UTC" + + children: + masters: + hosts: + node1: + ansible_host: 144.76.224.179 + ansible_user: root + node_role: master + datacenter: hetzner-de + + workers: + hosts: + node3: + ansible_host: 80.77.35.151 + ansible_port: 33147 + ansible_user: zevs + ansible_become: yes + ansible_become_pass: "{{ vault_node3_password }}" + node_role: worker + datacenter: remote-dc + gpu: true + gpu_type: "rtx3090" + + gpu_nodes: + hosts: + node3: + + local_dev: + hosts: + node2: + ansible_host: 192.168.1.244 + ansible_user: apple + node_role: development + datacenter: local +``` + +### 1.2 Bootstrap Playbook + +```yaml +# ansible/playbooks/bootstrap.yml +--- +- name: Bootstrap all nodes + hosts: all + become: yes + + vars: + common_packages: + - curl + - wget + - git + - htop + - vim + - jq + - unzip + - ca-certificates + - gnupg + - lsb-release + + tasks: + - name: Set timezone + timezone: + name: "{{ timezone }}" + + - name: Update apt cache + apt: + update_cache: yes + cache_valid_time: 3600 + when: ansible_os_family == "Debian" + + - name: Install common packages + apt: + name: "{{ common_packages }}" + state: present + when: ansible_os_family == "Debian" + + - name: Create admin group + group: + name: daarion-admin + state: present + + - name: Setup SSH authorized keys + authorized_key: + user: "{{ ansible_user }}" + key: "{{ lookup('file', '~/.ssh/daarion_network.pub') }}" + state: present + + - name: Disable password authentication + lineinfile: + path: /etc/ssh/sshd_config + regexp: '^#?PasswordAuthentication' + line: 'PasswordAuthentication no' + notify: restart sshd + + - name: Set hostname + hostname: + name: "{{ inventory_hostname }}" + + - name: Update /etc/hosts + lineinfile: + path: /etc/hosts + line: "{{ hostvars[item].ansible_host }} {{ item }}" + state: present + loop: "{{ groups['all'] }}" + when: 
hostvars[item].ansible_host is defined + + handlers: + - name: restart sshd + service: + name: sshd + state: restarted +``` + +### 1.3 Security Hardening Playbook + +```yaml +# ansible/playbooks/hardening.yml +--- +- name: Security Hardening + hosts: all + become: yes + + vars: + security_packages: + - fail2ban + - ufw + - auditd + - rkhunter + - unattended-upgrades + + allowed_ssh_port: "{{ ansible_port | default(22) }}" + + tasks: + - name: Install security packages + apt: + name: "{{ security_packages }}" + state: present + + - name: Install Trivy + shell: | + curl -sfL https://raw.githubusercontent.com/aquasecurity/trivy/main/contrib/install.sh | sh -s -- -b /usr/local/bin + args: + creates: /usr/local/bin/trivy + + # UFW Configuration + - name: UFW - Default deny incoming + ufw: + direction: incoming + policy: deny + + - name: UFW - Default deny outgoing + ufw: + direction: outgoing + policy: deny + + - name: UFW - Allow SSH + ufw: + rule: allow + port: "{{ allowed_ssh_port }}" + proto: tcp + + - name: UFW - Allow necessary outgoing + ufw: + rule: allow + direction: out + port: "{{ item.port }}" + proto: "{{ item.proto }}" + loop: + - { port: 53, proto: udp } # DNS + - { port: 80, proto: tcp } # HTTP + - { port: 443, proto: tcp } # HTTPS + - { port: 123, proto: udp } # NTP + + - name: UFW - Allow K3s ports (masters) + ufw: + rule: allow + port: "{{ item }}" + proto: tcp + loop: + - 6443 # K3s API + - 10250 # Kubelet + when: "'masters' in group_names" + + - name: UFW - Enable + ufw: + state: enabled + + # Fail2ban + - name: Configure fail2ban + template: + src: templates/jail.local.j2 + dest: /etc/fail2ban/jail.local + notify: restart fail2ban + + # Kernel hardening + - name: Kernel hardening sysctl + sysctl: + name: "{{ item.name }}" + value: "{{ item.value }}" + state: present + reload: yes + loop: + - { name: 'net.ipv4.ip_forward', value: '1' } # Required for K8s + - { name: 'net.ipv4.conf.all.accept_redirects', value: '0' } + - { name: 
'net.ipv4.conf.default.accept_redirects', value: '0' } + - { name: 'net.ipv4.tcp_syncookies', value: '1' } + - { name: 'kernel.randomize_va_space', value: '2' } + + # Security check script + - name: Create scripts directory + file: + path: /opt/scripts + state: directory + mode: '0755' + + - name: Deploy security check script + copy: + src: files/security-check.sh + dest: /opt/scripts/security-check.sh + mode: '0755' + + - name: Setup security cron + cron: + name: "Hourly security check" + minute: "0" + job: "/opt/scripts/security-check.sh" + + handlers: + - name: restart fail2ban + service: + name: fail2ban + state: restarted +``` + +### 1.4 K3s Installation Playbook + +```yaml +# ansible/playbooks/k3s-install.yml +--- +- name: Install K3s on Masters + hosts: masters + become: yes + + vars: + k3s_version: "v1.29.0+k3s1" + + tasks: + - name: Download K3s installer + get_url: + url: https://get.k3s.io + dest: /tmp/k3s-install.sh + mode: '0755' + + - name: Install K3s server + shell: | + INSTALL_K3S_VERSION={{ k3s_version }} \ + K3S_TOKEN={{ k3s_token }} \ + sh /tmp/k3s-install.sh server \ + --disable traefik \ + --disable servicelb \ + --write-kubeconfig-mode 644 \ + --tls-san {{ ansible_host }} \ + --node-label "datacenter={{ datacenter }}" \ + --node-label "node-role={{ node_role }}" + args: + creates: /etc/rancher/k3s/k3s.yaml + + - name: Wait for K3s to be ready + wait_for: + port: 6443 + delay: 10 + timeout: 300 + + - name: Get K3s token + slurp: + src: /var/lib/rancher/k3s/server/node-token + register: k3s_token_file + + - name: Save K3s token + set_fact: + k3s_join_token: "{{ k3s_token_file.content | b64decode | trim }}" + + - name: Fetch kubeconfig + fetch: + src: /etc/rancher/k3s/k3s.yaml + dest: "{{ playbook_dir }}/../kubeconfig/{{ inventory_hostname }}.yaml" + flat: yes + +--- +- name: Install K3s on Workers + hosts: workers + become: yes + + vars: + k3s_version: "v1.29.0+k3s1" + k3s_master: "{{ hostvars[groups['masters'][0]].ansible_host }}" + + tasks: + 
- name: Download K3s installer + get_url: + url: https://get.k3s.io + dest: /tmp/k3s-install.sh + mode: '0755' + + - name: Install K3s agent + shell: | + INSTALL_K3S_VERSION={{ k3s_version }} \ + K3S_URL=https://{{ k3s_master }}:6443 \ + K3S_TOKEN={{ hostvars[groups['masters'][0]].k3s_join_token }} \ + sh /tmp/k3s-install.sh agent \ + --node-label "datacenter={{ datacenter }}" \ + --node-label "node-role={{ node_role }}" \ + {% if gpu is defined and gpu %} + --node-label "gpu=true" \ + --node-label "gpu-type={{ gpu_type }}" + {% endif %} + args: + creates: /etc/rancher/k3s/k3s.yaml +``` + +--- + +## πŸ” Phase 2: Vault Setup + +### 2.1 Vault Installation + +```yaml +# ansible/playbooks/vault-setup.yml +--- +- name: Install HashiCorp Vault + hosts: masters + become: yes + + vars: + vault_version: "1.15.4" + vault_data_dir: "/opt/vault/data" + + tasks: + - name: Create vault user + user: + name: vault + system: yes + shell: /bin/false + + - name: Create vault directories + file: + path: "{{ item }}" + state: directory + owner: vault + group: vault + mode: '0750' + loop: + - /opt/vault + - /opt/vault/data + - /opt/vault/config + - /opt/vault/logs + + - name: Download Vault + get_url: + url: "https://releases.hashicorp.com/vault/{{ vault_version }}/vault_{{ vault_version }}_linux_amd64.zip" + dest: /tmp/vault.zip + + - name: Extract Vault + unarchive: + src: /tmp/vault.zip + dest: /usr/local/bin + remote_src: yes + + - name: Vault configuration + template: + src: templates/vault.hcl.j2 + dest: /opt/vault/config/vault.hcl + owner: vault + group: vault + notify: restart vault + + - name: Vault systemd service + template: + src: templates/vault.service.j2 + dest: /etc/systemd/system/vault.service + notify: + - reload systemd + - restart vault + + - name: Enable and start Vault + service: + name: vault + enabled: yes + state: started + + handlers: + - name: reload systemd + systemd: + daemon_reload: yes + + - name: restart vault + service: + name: vault + state: restarted 
+``` + +### 2.2 Vault Configuration + +```hcl +# ansible/templates/vault.hcl.j2 +ui = true + +storage "file" { + path = "/opt/vault/data" +} + +listener "tcp" { + address = "0.0.0.0:8200" + tls_disable = "true" # Enable TLS in production! +} + +api_addr = "http://{{ ansible_host }}:8200" +cluster_addr = "https://{{ ansible_host }}:8201" +``` + +### 2.3 External Secrets Operator + +```yaml +# kubernetes/external-secrets/vault-backend.yml +apiVersion: external-secrets.io/v1beta1 +kind: ClusterSecretStore +metadata: + name: vault-backend +spec: + provider: + vault: + server: "http://node1:8200" + path: "secret" + version: "v2" + auth: + kubernetes: + mountPath: "kubernetes" + role: "external-secrets" + serviceAccountRef: + name: "external-secrets" + namespace: "external-secrets" + +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: postgres-credentials + namespace: daarion +spec: + refreshInterval: "1h" + secretStoreRef: + name: vault-backend + kind: ClusterSecretStore + target: + name: postgres-credentials + creationPolicy: Owner + data: + - secretKey: username + remoteRef: + key: secret/data/postgres + property: username + - secretKey: password + remoteRef: + key: secret/data/postgres + property: password +``` + +--- + +## πŸ” Phase 3: Consul (Multi-DC) + +### 3.1 Consul Installation + +```yaml +# ansible/playbooks/consul-setup.yml +--- +- name: Install Consul + hosts: all + become: yes + + vars: + consul_version: "1.17.1" + consul_datacenter: "{{ datacenter }}" + consul_is_server: "{{ 'masters' in group_names }}" + + tasks: + - name: Create consul user + user: + name: consul + system: yes + shell: /bin/false + + - name: Create consul directories + file: + path: "{{ item }}" + state: directory + owner: consul + group: consul + loop: + - /opt/consul + - /opt/consul/data + - /opt/consul/config + + - name: Download Consul + get_url: + url: "https://releases.hashicorp.com/consul/{{ consul_version }}/consul_{{ consul_version 
}}_linux_amd64.zip" + dest: /tmp/consul.zip + + - name: Extract Consul + unarchive: + src: /tmp/consul.zip + dest: /usr/local/bin + remote_src: yes + + - name: Consul configuration + template: + src: templates/consul.hcl.j2 + dest: /opt/consul/config/consul.hcl + owner: consul + group: consul + notify: restart consul + + - name: Consul systemd service + template: + src: templates/consul.service.j2 + dest: /etc/systemd/system/consul.service + notify: + - reload systemd + - restart consul + + - name: Enable and start Consul + service: + name: consul + enabled: yes + state: started + + handlers: + - name: reload systemd + systemd: + daemon_reload: yes + + - name: restart consul + service: + name: consul + state: restarted +``` + +### 3.2 Consul Configuration + +```hcl +# ansible/templates/consul.hcl.j2 +datacenter = "{{ consul_datacenter }}" +data_dir = "/opt/consul/data" +log_level = "INFO" +node_name = "{{ inventory_hostname }}" +bind_addr = "{{ ansible_host }}" +client_addr = "0.0.0.0" + +{% if consul_is_server %} +server = true +bootstrap_expect = {{ groups['masters'] | length }} +ui_config { + enabled = true +} +{% endif %} + +# Join other servers +retry_join = [ +{% for host in groups['masters'] %} + "{{ hostvars[host].ansible_host }}"{% if not loop.last %},{% endif %} + +{% endfor %} +] + +# WAN federation for multi-DC +{% if groups['masters'] | length > 1 %} +retry_join_wan = [ +{% for host in groups['masters'] %} + "{{ hostvars[host].ansible_host }}"{% if not loop.last %},{% endif %} + +{% endfor %} +] +{% endif %} + +# Service mesh +connect { + enabled = true +} + +# DNS +ports { + dns = 8600 +} + +# ACL (enable in production) +acl { + enabled = false + default_policy = "allow" +} +``` + +--- + +## πŸ“Š Phase 4: Observability Stack + +### 4.1 Prometheus + Grafana + Loki + Tempo + +```yaml +# ansible/playbooks/observability.yml +--- +- name: Deploy Observability Stack + hosts: masters + become: yes + + tasks: + - name: Create monitoring namespace + 
kubernetes.core.k8s: + state: present + definition: + apiVersion: v1 + kind: Namespace + metadata: + name: monitoring + + - name: Add Prometheus Helm repo + kubernetes.core.helm_repository: + name: prometheus-community + repo_url: https://prometheus-community.github.io/helm-charts + + - name: Add Grafana Helm repo + kubernetes.core.helm_repository: + name: grafana + repo_url: https://grafana.github.io/helm-charts + + - name: Install kube-prometheus-stack + kubernetes.core.helm: + name: prometheus + chart_ref: prometheus-community/kube-prometheus-stack + release_namespace: monitoring + create_namespace: yes + values: + prometheus: + prometheusSpec: + retention: 30d + storageSpec: + volumeClaimTemplate: + spec: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 50Gi + grafana: + adminPassword: "{{ vault_grafana_password }}" + persistence: + enabled: true + size: 10Gi + + - name: Install Loki + kubernetes.core.helm: + name: loki + chart_ref: grafana/loki-stack + release_namespace: monitoring + values: + loki: + persistence: + enabled: true + size: 50Gi + promtail: + enabled: true + + - name: Install Tempo + kubernetes.core.helm: + name: tempo + chart_ref: grafana/tempo + release_namespace: monitoring + values: + tempo: + retention: 168h # 7 days +``` + +### 4.2 Grafana Dashboards + +```yaml +# kubernetes/apps/monitoring/grafana-dashboards.yml +apiVersion: v1 +kind: ConfigMap +metadata: + name: daarion-dashboards + namespace: monitoring + labels: + grafana_dashboard: "1" +data: + daarion-network.json: | + { + "dashboard": { + "title": "DAARION Network Overview", + "panels": [ + { + "title": "Total Nodes", + "type": "stat", + "targets": [{"expr": "count(up{job=\"node-exporter\"})"}] + }, + { + "title": "Nodes by Datacenter", + "type": "piechart", + "targets": [{"expr": "count by (datacenter) (up{job=\"node-exporter\"})"}] + }, + { + "title": "GPU Nodes", + "type": "stat", + "targets": [{"expr": "count(up{job=\"node-exporter\", gpu=\"true\"})"}] + }, + 
{ + "title": "K3s Cluster Status", + "type": "stat", + "targets": [{"expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"})"}] + } + ] + } + } +``` + +--- + +## πŸš€ Quick Start + +### ΠšΡ€ΠΎΠΊ 1: ΠŸΡ–Π΄Π³ΠΎΡ‚ΠΎΠ²ΠΊΠ° + +```bash +# ΠšΠ»ΠΎΠ½ΡƒΠ²Π°Ρ‚ΠΈ Ρ€Π΅ΠΏΠΎΠ·ΠΈΡ‚ΠΎΡ€Ρ–ΠΉ +git clone git@github.com:IvanTytar/microdao-daarion.git +cd microdao-daarion/infrastructure + +# Π‘Ρ‚Π²ΠΎΡ€ΠΈΡ‚ΠΈ SSH ΠΊΠ»ΡŽΡ‡ для ΠΌΠ΅Ρ€Π΅ΠΆΡ– +ssh-keygen -t ed25519 -f ~/.ssh/daarion_network -C "daarion-network" + +# Встановити Ansible +pip install ansible ansible-lint + +# Встановити Terraform +brew install terraform # macOS +``` + +### ΠšΡ€ΠΎΠΊ 2: ΠΠ°Π»Π°ΡˆΡ‚ΡƒΠ²Π°Π½Π½Ρ inventory + +```bash +# Π‘ΠΊΠΎΠΏΡ–ΡŽΠ²Π°Ρ‚ΠΈ ΠΏΡ€ΠΈΠΊΠ»Π°Π΄ +cp ansible/inventory/example.yml ansible/inventory/production.yml + +# Π’Ρ–Π΄Ρ€Π΅Π΄Π°Π³ΡƒΠ²Π°Ρ‚ΠΈ ΠΏΡ–Π΄ свої Π½ΠΎΠ΄ΠΈ +vim ansible/inventory/production.yml +``` + +### ΠšΡ€ΠΎΠΊ 3: Bootstrap Π½ΠΎΠ΄ + +```bash +cd ansible + +# ΠŸΠ΅Ρ€Π΅Π²Ρ–Ρ€ΠΈΡ‚ΠΈ Π·'єднання +ansible all -i inventory/production.yml -m ping + +# Bootstrap +ansible-playbook -i inventory/production.yml playbooks/bootstrap.yml + +# Hardening +ansible-playbook -i inventory/production.yml playbooks/hardening.yml +``` + +### ΠšΡ€ΠΎΠΊ 4: K3s кластСр + +```bash +# Встановити K3s +ansible-playbook -i inventory/production.yml playbooks/k3s-install.yml + +# ΠŸΠ΅Ρ€Π΅Π²Ρ–Ρ€ΠΈΡ‚ΠΈ +export KUBECONFIG=kubeconfig/node1.yaml +kubectl get nodes +``` + +### ΠšΡ€ΠΎΠΊ 5: Vault + Consul + +```bash +# Vault +ansible-playbook -i inventory/production.yml playbooks/vault-setup.yml + +# Consul (якщо multi-DC) +ansible-playbook -i inventory/production.yml playbooks/consul-setup.yml +``` + +### ΠšΡ€ΠΎΠΊ 6: Observability + +```bash +# Prometheus + Grafana + Loki + Tempo +ansible-playbook -i inventory/production.yml playbooks/observability.yml +``` + +--- + +## πŸ“‹ Checklist + +### Phase 1: Foundation +- [x] NODE1 security hardening +- [x] NODE3 security hardening +- [x] PostgreSQL on NODE1 & NODE3 +- [ ] 
Ansible repository structure +- [ ] SSH key distribution +- [ ] Bootstrap playbook tested + +### Phase 2: K3s Cluster +- [ ] K3s on NODE1 (master) +- [ ] K3s on NODE3 (worker + GPU) +- [ ] CoreDNS configured +- [ ] Network policies + +### Phase 3: Secrets & Discovery +- [ ] Vault installed +- [ ] External Secrets Operator +- [ ] Consul (if needed for multi-DC) + +### Phase 4: Observability +- [ ] Prometheus +- [ ] Grafana +- [ ] Loki +- [ ] Tempo +- [ ] Alerting rules + +--- + +**Автор:** Ivan Tytar & AI Assistant +**ΠžΡΡ‚Π°Π½Π½Ρ” оновлСння:** 2026-01-10 diff --git a/NETWORK-150-NODES-PLAN.md b/NETWORK-150-NODES-PLAN.md deleted file mode 100644 index 0bd1ec15..00000000 --- a/NETWORK-150-NODES-PLAN.md +++ /dev/null @@ -1,633 +0,0 @@ -# 🌐 План розгортання ΠΌΠ΅Ρ€Π΅ΠΆΡ– 150 Π½ΠΎΠ΄ β€” DAARION Network - -**ВСрсія:** 1.0.0 -**Π”Π°Ρ‚Π°:** 2026-01-10 -**Бтатус:** ΠŸΠ»Π°Π½ΡƒΠ²Π°Π½Π½Ρ - ---- - -## πŸ“‹ Зміст - -1. [АрхітСктура ΠΌΠ΅Ρ€Π΅ΠΆΡ–](#Π°Ρ€Ρ…Ρ–Ρ‚Π΅ΠΊΡ‚ΡƒΡ€Π°-ΠΌΠ΅Ρ€Π΅ΠΆΡ–) -2. [Π¦Π΅Π½Ρ‚Ρ€Π°Π»Ρ–Π·ΠΎΠ²Π°Π½Π΅ управління](#Ρ†Π΅Π½Ρ‚Ρ€Π°Π»Ρ–Π·ΠΎΠ²Π°Π½Π΅-управління) -3. [Автоматизація розгортання](#автоматизація-розгортання) -4. [Π‘Π΅Π·ΠΏΠ΅ΠΊΠ° ΠΌΠ΅Ρ€Π΅ΠΆΡ–](#Π±Π΅Π·ΠΏΠ΅ΠΊΠ°-ΠΌΠ΅Ρ€Π΅ΠΆΡ–) -5. [ΠœΠΎΠ½Ρ–Ρ‚ΠΎΡ€ΠΈΠ½Π³ Ρ‚Π° Π°Π»Π΅Ρ€Ρ‚ΠΈ](#ΠΌΠΎΠ½Ρ–Ρ‚ΠΎΡ€ΠΈΠ½Π³-Ρ‚Π°-Π°Π»Π΅Ρ€Ρ‚ΠΈ) -6. 
[Roadmap](#roadmap) - ---- - -## πŸ—οΈ АрхітСктура ΠΌΠ΅Ρ€Π΅ΠΆΡ– - -### Ієрархія Π½ΠΎΠ΄ - -``` - β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” - β”‚ MASTER NODE β”‚ - β”‚ (NODE1) β”‚ - β”‚ Hetzner β”‚ - β””β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β”‚ - β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” - β”‚ β”‚ β”‚ - β”Œβ”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β” - β”‚ REGION EU β”‚ β”‚ REGION US β”‚ β”‚ REGION ASIA β”‚ - β”‚ Controller β”‚ β”‚ Controller β”‚ β”‚ Controller β”‚ - β””β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”˜ - β”‚ β”‚ β”‚ - β”Œβ”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”Όβ”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β” - β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ - 50 50 50 25 25 25 25 25 25 - nodes nodes nodes nodes nodes nodes nodes nodes nodes -``` - -### Π’ΠΈΠΏΠΈ Π½ΠΎΠ΄ - -| Π’ΠΈΠΏ | ΠšΡ–Π»ΡŒΠΊΡ–ΡΡ‚ΡŒ | Роль | РСсурси | -|-----|-----------|------|---------| -| **Master** | 1 | Π¦Π΅Π½Ρ‚Ρ€Π°Π»ΡŒΠ½Π΅ управління, GitOps | 8 CPU, 32GB RAM | -| **Region Controller** | 3-5 | Π Π΅Π³Ρ–ΠΎΠ½Π°Π»ΡŒΠ½Π΅ управління | 4 CPU, 16GB RAM | -| **Compute Node** | ~140 | ΠžΠ±Ρ‡ΠΈΡΠ»Π΅Π½Π½Ρ, AI workloads | 2-8 CPU, 8-64GB RAM | -| **GPU Node** | ~5 | AI/ML inference | GPU + 32GB+ RAM | - ---- - -## πŸŽ›οΈ Π¦Π΅Π½Ρ‚Ρ€Π°Π»Ρ–Π·ΠΎΠ²Π°Π½Π΅ управління - -### ІнструмСнти - -| ІнструмСнт | ΠŸΡ€ΠΈΠ·Π½Π°Ρ‡Π΅Π½Π½Ρ | ΠΠ»ΡŒΡ‚Π΅Ρ€Π½Π°Ρ‚ΠΈΠ²Π° | -|------------|-------------|--------------| -| **Ansible** | Configuration Management | Salt, Puppet | -| **Terraform** | Infrastructure as Code | Pulumi | -| **Kubernetes** | Container Orchestration | Docker Swarm | -| **Consul** | Service Discovery | etcd | -| **Vault** | Secrets Management | AWS Secrets Manager | -| **Prometheus** | Metrics 
| InfluxDB | -| **Grafana** | Dashboards | - | -| **Loki** | Logs | ELK Stack | - -### Ansible Inventory Structure - -```yaml -# inventory/production.yml -all: - children: - masters: - hosts: - node1-master: - ansible_host: 144.76.224.179 - ansible_user: root - - region_controllers: - hosts: - node3-eu: - ansible_host: 80.77.35.151 - ansible_port: 33147 - ansible_user: zevs - ansible_become_pass: "{{ vault_node3_password }}" - - compute_nodes: - children: - eu_nodes: - hosts: - node-eu-[001:050]: - ansible_host: "{{ inventory_hostname }}.daarion.network" - us_nodes: - hosts: - node-us-[001:050]: - ansible_host: "{{ inventory_hostname }}.daarion.network" - asia_nodes: - hosts: - node-asia-[001:050]: - ansible_host: "{{ inventory_hostname }}.daarion.network" - - gpu_nodes: - hosts: - gpu-[01:05]: - ansible_host: "{{ inventory_hostname }}.daarion.network" -``` - -### Ansible Playbook: Security Setup - -```yaml -# playbooks/security-setup.yml ---- -- name: Security Setup for All Nodes - hosts: all - become: yes - - vars: - security_packages: - - fail2ban - - auditd - - rkhunter - - chkrootkit - - ufw - - tasks: - - name: Update apt cache - apt: - update_cache: yes - cache_valid_time: 3600 - - - name: Install security packages - apt: - name: "{{ security_packages }}" - state: present - - - name: Install Trivy - shell: | - curl -sfL https://raw.githubusercontent.com/aquasecurity/trivy/main/contrib/install.sh | sh -s -- -b /usr/local/bin - args: - creates: /usr/local/bin/trivy - - - name: Configure fail2ban - template: - src: templates/jail.local.j2 - dest: /etc/fail2ban/jail.local - notify: restart fail2ban - - - name: Configure UFW defaults - ufw: - direction: "{{ item.direction }}" - policy: "{{ item.policy }}" - loop: - - { direction: incoming, policy: deny } - - { direction: outgoing, policy: deny } - - - name: Allow SSH - ufw: - rule: allow - port: "{{ ansible_port | default(22) }}" - proto: tcp - - - name: Allow necessary outgoing - ufw: - rule: allow - direction: 
out - port: "{{ item }}" - proto: "{{ item.proto | default('tcp') }}" - loop: - - { port: 53, proto: udp } - - { port: 80 } - - { port: 443 } - - { port: 123, proto: udp } - - - name: Block internal networks - ufw: - rule: deny - direction: out - to_ip: "{{ item }}" - loop: - - 10.0.0.0/8 - - 172.16.0.0/12 - - - name: Enable UFW - ufw: - state: enabled - - - name: Copy security check script - copy: - src: files/security-check.sh - dest: /opt/scripts/security-check.sh - mode: '0755' - - - name: Setup security cron - cron: - name: "Security check" - minute: "0" - job: "/opt/scripts/security-check.sh" - - handlers: - - name: restart fail2ban - service: - name: fail2ban - state: restarted -``` - -### Ansible Playbook: PostgreSQL Deployment - -```yaml -# playbooks/postgresql-deploy.yml ---- -- name: Deploy PostgreSQL to Nodes - hosts: database_nodes - become: yes - - vars: - postgres_image: "postgres@sha256:23e88eb049fd5d54894d70100df61d38a49ed97909263f79d4ff4c30a5d5fca2" - postgres_user: "daarion" - postgres_password: "{{ vault_postgres_password }}" - postgres_db: "daarion_main" - - tasks: - - name: Pull PostgreSQL image - docker_image: - name: "{{ postgres_image }}" - source: pull - - - name: Scan image with Trivy - command: trivy image --severity HIGH,CRITICAL --exit-code 1 {{ postgres_image }} - register: trivy_result - failed_when: trivy_result.rc != 0 - - - name: Create PostgreSQL volume - docker_volume: - name: "postgres_data_{{ inventory_hostname }}" - - - name: Run PostgreSQL container - docker_container: - name: dagi-postgres - image: "{{ postgres_image }}" - state: started - restart_policy: "no" - security_opts: - - no-new-privileges:true - read_only: yes - tmpfs: - - /tmp:noexec,nosuid,nodev,size=100m - - /var/run/postgresql:noexec,nosuid,nodev,size=10m - volumes: - - "postgres_data_{{ inventory_hostname }}:/var/lib/postgresql/data" - env: - POSTGRES_USER: "{{ postgres_user }}" - POSTGRES_PASSWORD: "{{ postgres_password }}" - POSTGRES_DB: "{{ postgres_db }}" 
- cpus: 2 - memory: 2g - ports: - - "5432:5432" - - - name: Wait for PostgreSQL to be ready - wait_for: - host: localhost - port: 5432 - delay: 5 - timeout: 60 -``` - ---- - -## πŸš€ Автоматизація розгортання - -### GitOps Workflow - -``` -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ GitHub │────▢│ ArgoCD │────▢│ Kubernetes β”‚ -β”‚ (configs) β”‚ β”‚ (GitOps) β”‚ β”‚ (runtime) β”‚ -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β”‚ β”‚ β”‚ - β”‚ β”‚ β”‚ - β–Ό β–Ό β–Ό -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ Terraform │────▢│ Ansible │────▢│ Nodes β”‚ -β”‚ (infra) β”‚ β”‚ (config) β”‚ β”‚ (150) β”‚ -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ -``` - -### Terraform: Node Provisioning - -```hcl -# terraform/main.tf -terraform { - required_providers { - hcloud = { - source = "hetznercloud/hcloud" - } - } -} - -variable "hcloud_token" { - sensitive = true -} - -variable "node_count" { - default = 50 -} - -provider "hcloud" { - token = var.hcloud_token -} - -resource "hcloud_ssh_key" "default" { - name = "daarion-network" - public_key = file("~/.ssh/daarion_network.pub") -} - -resource "hcloud_server" "compute_nodes" { - count = var.node_count - name = "node-eu-${format("%03d", count.index + 1)}" - server_type = "cx31" # 2 vCPU, 8GB RAM - image = "ubuntu-24.04" - location = "nbg1" - ssh_keys = [hcloud_ssh_key.default.id] - - labels = { - role = "compute" - region = "eu" - managed = "terraform" - } - - user_data = <<-EOF - #cloud-config - packages: - - docker.io - - fail2ban - - ufw - runcmd: - - systemctl enable docker - - systemctl start docker - - ufw default deny incoming - - ufw default 
deny outgoing - - ufw allow 22/tcp - - ufw allow out 53/udp - - ufw allow out 443/tcp - - ufw --force enable - EOF -} - -output "node_ips" { - value = hcloud_server.compute_nodes[*].ipv4_address -} -``` - -### Deployment Script - -```bash -#!/bin/bash -# scripts/deploy-network.sh - -set -e - -NODES_COUNT=${1:-10} -REGION=${2:-eu} - -echo "πŸš€ Deploying $NODES_COUNT nodes in $REGION region..." - -# 1. Provision infrastructure -echo "[1/5] Provisioning infrastructure..." -cd terraform -terraform init -terraform apply -var="node_count=$NODES_COUNT" -auto-approve -cd .. - -# 2. Wait for nodes to be ready -echo "[2/5] Waiting for nodes..." -sleep 60 - -# 3. Update Ansible inventory -echo "[3/5] Updating inventory..." -terraform output -json node_ips | jq -r '.[]' > inventory/hosts_$REGION.txt - -# 4. Run security setup -echo "[4/5] Running security setup..." -ansible-playbook -i inventory/production.yml playbooks/security-setup.yml --limit "${REGION}_nodes" - -# 5. Deploy services -echo "[5/5] Deploying services..." -ansible-playbook -i inventory/production.yml playbooks/services-deploy.yml --limit "${REGION}_nodes" - -echo "βœ… Deployment complete!" 
-``` - ---- - -## πŸ”’ Π‘Π΅Π·ΠΏΠ΅ΠΊΠ° ΠΌΠ΅Ρ€Π΅ΠΆΡ– - -### Zero Trust Architecture - -``` -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ ZERO TRUST LAYER β”‚ -β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ -β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ -β”‚ β”‚ mTLS β”‚ β”‚ RBAC β”‚ β”‚ Network β”‚ β”‚ Secrets β”‚ β”‚ -β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ Policy β”‚ β”‚ Vault β”‚ β”‚ -β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ -β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ -β”‚ SERVICE MESH (Istio) β”‚ -β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ -β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ -β”‚ β”‚ Node 1 β”‚ β”‚ Node 2 β”‚ β”‚ Node 3 β”‚ β”‚ Node N β”‚ β”‚ -β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ -``` - -### Security Policies - -```yaml -# k8s/network-policy.yml -apiVersion: networking.k8s.io/v1 -kind: NetworkPolicy -metadata: - name: default-deny-all -spec: - podSelector: {} - 
policyTypes: - - Ingress - - Egress - ---- -apiVersion: networking.k8s.io/v1 -kind: NetworkPolicy -metadata: - name: allow-postgres -spec: - podSelector: - matchLabels: - app: postgres - ingress: - - from: - - podSelector: - matchLabels: - access: postgres - ports: - - protocol: TCP - port: 5432 -``` - -### Vault Integration - -```yaml -# vault/postgres-policy.hcl -path "database/creds/daarion-db" { - capabilities = ["read"] -} - -path "secret/data/postgres/*" { - capabilities = ["read"] -} -``` - -```bash -# ΠžΡ‚Ρ€ΠΈΠΌΠ°Π½Π½Ρ credentials -vault read database/creds/daarion-db -``` - ---- - -## πŸ“Š ΠœΠΎΠ½Ρ–Ρ‚ΠΎΡ€ΠΈΠ½Π³ Ρ‚Π° Π°Π»Π΅Ρ€Ρ‚ΠΈ - -### Prometheus Federation - -```yaml -# prometheus/federation.yml -global: - scrape_interval: 15s - evaluation_interval: 15s - -scrape_configs: - - job_name: 'federate' - scrape_interval: 30s - honor_labels: true - metrics_path: '/federate' - params: - 'match[]': - - '{job="node"}' - - '{job="docker"}' - - '{job="postgres"}' - static_configs: - - targets: - - 'node-eu-001:9090' - - 'node-eu-002:9090' - # ... 
all nodes -``` - -### Grafana Dashboard - -```json -{ - "dashboard": { - "title": "DAARION Network Overview", - "panels": [ - { - "title": "Total Nodes", - "type": "stat", - "targets": [ - { - "expr": "count(up{job=\"node\"})" - } - ] - }, - { - "title": "Healthy Nodes", - "type": "stat", - "targets": [ - { - "expr": "count(up{job=\"node\"} == 1)" - } - ] - }, - { - "title": "Security Alerts", - "type": "stat", - "targets": [ - { - "expr": "sum(security_alerts_total)" - } - ] - } - ] - } -} -``` - -### Alert Rules - -```yaml -# prometheus/alerts.yml -groups: - - name: network - rules: - - alert: NodeDown - expr: up{job="node"} == 0 - for: 5m - labels: - severity: critical - annotations: - summary: "Node {{ $labels.instance }} is down" - - - alert: HighCPU - expr: 100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80 - for: 10m - labels: - severity: warning - - - alert: SuspiciousProcess - expr: security_suspicious_process > 0 - for: 1m - labels: - severity: critical - annotations: - summary: "Suspicious process on {{ $labels.instance }}" - - - alert: PostgresDown - expr: pg_up == 0 - for: 1m - labels: - severity: critical -``` - ---- - -## πŸ“… Roadmap - -### Phase 1: Foundation (ВиТдСнь 1-2) -- [x] NODE1 rebuild + security -- [x] NODE3 setup + security -- [x] PostgreSQL Π½Π° NODE1 Ρ‚Π° NODE3 -- [ ] Ansible repository setup -- [ ] Terraform configs -- [ ] CI/CD pipeline - -### Phase 2: Regional Controllers (ВиТдСнь 3-4) -- [ ] Deploy 3 region controllers -- [ ] Consul cluster setup -- [ ] Vault setup -- [ ] Prometheus federation - -### Phase 3: First 50 Nodes (ВиТдСнь 5-8) -- [ ] EU region: 50 nodes -- [ ] Automated deployment testing -- [ ] Security audit -- [ ] Performance testing - -### Phase 4: Scale to 150 (ВиТдСнь 9-12) -- [ ] US region: 50 nodes -- [ ] Asia region: 50 nodes -- [ ] Global monitoring -- [ ] Disaster recovery testing - -### Phase 5: Production (ВиТдСнь 13+) -- [ ] Full production workloads -- [ ] 24/7 monitoring -- [ ] Automated incident response -- [ ] 
Continuous security audits - ---- - -## πŸ’° Estimated Costs - -| Resource | Per Node | 50 Nodes | 150 Nodes | -|----------|----------|----------|-----------| -| Hetzner CX31 | €10/mo | €500/mo | €1,500/mo | -| Storage (100GB) | €5/mo | €250/mo | €750/mo | -| Bandwidth | ~€5/mo | €250/mo | €750/mo | -| **Total** | **€20/mo** | **€1,000/mo** | **€3,000/mo** | - ---- - -## πŸ“š Π”ΠΎΠ΄Π°Ρ‚ΠΊΠΎΠ²Ρ– рСсурси - -- [Ansible Best Practices](https://docs.ansible.com/ansible/latest/user_guide/playbooks_best_practices.html) -- [Terraform Hetzner Provider](https://registry.terraform.io/providers/hetznercloud/hcloud/latest/docs) -- [Kubernetes Network Policies](https://kubernetes.io/docs/concepts/services-networking/network-policies/) -- [HashiCorp Vault](https://www.vaultproject.io/docs) -- [Prometheus Federation](https://prometheus.io/docs/prometheus/latest/federation/) - ---- - -**Автор:** Ivan Tytar & AI Assistant -**ΠžΡΡ‚Π°Π½Π½Ρ” оновлСння:** 2026-01-10 diff --git a/infrastructure/ansible/.vault_pass.example b/infrastructure/ansible/.vault_pass.example new file mode 100644 index 00000000..c8b56d0d --- /dev/null +++ b/infrastructure/ansible/.vault_pass.example @@ -0,0 +1 @@ +# Create .vault_pass file with your vault password diff --git a/infrastructure/ansible/ansible.cfg b/infrastructure/ansible/ansible.cfg new file mode 100644 index 00000000..23c2d809 --- /dev/null +++ b/infrastructure/ansible/ansible.cfg @@ -0,0 +1,31 @@ +# DAARION Network - Ansible Configuration +[defaults] +inventory = inventory/production.yml +remote_user = root +host_key_checking = False +retry_files_enabled = False +gathering = smart +fact_caching = jsonfile +fact_caching_connection = /tmp/ansible_facts +fact_caching_timeout = 86400 + +# Parallelism +forks = 20 + +# Output +stdout_callback = yaml +callback_whitelist = profile_tasks + +# Vault +vault_password_file = .vault_pass + +[ssh_connection] +pipelining = True +control_path = /tmp/ansible-%%h-%%p-%%r +ssh_args = -o ControlMaster=auto -o 
ControlPersist=60s -o StrictHostKeyChecking=no + +[privilege_escalation] +become = True +become_method = sudo +become_user = root +become_ask_pass = False diff --git a/infrastructure/ansible/inventory/group_vars/all.yml b/infrastructure/ansible/inventory/group_vars/all.yml new file mode 100644 index 00000000..0b9f2447 --- /dev/null +++ b/infrastructure/ansible/inventory/group_vars/all.yml @@ -0,0 +1,93 @@ +# DAARION Network - Global Variables +# These variables apply to all hosts + +# ============================================================================= +# SECURITY +# ============================================================================= +security_packages: + - fail2ban + - ufw + - auditd + - rkhunter + - unattended-upgrades + - ca-certificates + +# Firewall - allowed ports (in addition to SSH) +firewall_allowed_tcp_ports: + - 6443 # K3s API + - 10250 # Kubelet + - 8200 # Vault + - 8500 # Consul HTTP + - 8600 # Consul DNS + - 9090 # Prometheus + - 3000 # Grafana + - 5432 # PostgreSQL + +firewall_allowed_outgoing: + - { port: 53, proto: udp } # DNS + - { port: 80, proto: tcp } # HTTP + - { port: 443, proto: tcp } # HTTPS + - { port: 123, proto: udp } # NTP + +# Blocked networks (internal/private) +firewall_blocked_networks: + - 10.0.0.0/8 + - 172.16.0.0/12 + +# ============================================================================= +# DOCKER +# ============================================================================= +docker_users: + - "{{ ansible_user }}" + +docker_daemon_options: + storage-driver: "overlay2" + log-driver: "json-file" + log-opts: + max-size: "100m" + max-file: "3" + +# ============================================================================= +# K3S / KUBERNETES +# ============================================================================= +k3s_version: "v1.29.0+k3s1" +k3s_disable: + - traefik + - servicelb + +# ============================================================================= +# VAULT +# 
============================================================================= +vault_version: "1.15.4" +vault_addr: "http://node1:8200" +vault_data_dir: "/opt/vault/data" + +# ============================================================================= +# CONSUL +# ============================================================================= +consul_version: "1.17.1" +consul_data_dir: "/opt/consul/data" +consul_enable_connect: true + +# ============================================================================= +# OBSERVABILITY +# ============================================================================= +prometheus_retention: "30d" +prometheus_storage_size: "50Gi" +loki_retention: "168h" # 7 days +tempo_retention: "168h" # 7 days + +# ============================================================================= +# POSTGRESQL +# ============================================================================= +postgres_image: "postgres@sha256:23e88eb049fd5d54894d70100df61d38a49ed97909263f79d4ff4c30a5d5fca2" +postgres_user: "daarion" +postgres_db: "daarion_main" + +# ============================================================================= +# PATHS +# ============================================================================= +scripts_dir: "/opt/scripts" +config_dir: "/opt/config" +logs_dir: "/var/log/daarion" +backup_dir: "/opt/backups" diff --git a/infrastructure/ansible/inventory/production.yml b/infrastructure/ansible/inventory/production.yml new file mode 100644 index 00000000..603fadf9 --- /dev/null +++ b/infrastructure/ansible/inventory/production.yml @@ -0,0 +1,65 @@ +# DAARION Network - Production Inventory +# Version: 1.0.0 +# Updated: 2026-01-10 + +all: + vars: + ansible_python_interpreter: /usr/bin/python3 + timezone: "UTC" + + # K3s configuration + k3s_version: "v1.29.0+k3s1" + k3s_token: "{{ vault_k3s_token }}" + + # Network + daarion_network_cidr: "10.42.0.0/16" + daarion_service_cidr: "10.43.0.0/16" + + children: + # Master nodes - control 
plane + masters: + hosts: + node1: + ansible_host: 144.76.224.179 + ansible_user: root + ansible_ssh_pass: "{{ vault_node1_password }}" + node_role: master + datacenter: hetzner-de + location: "Nuremberg, Germany" + + # Worker nodes - compute + workers: + hosts: + node3: + ansible_host: 80.77.35.151 + ansible_port: 33147 + ansible_user: zevs + ansible_become: yes + ansible_become_pass: "{{ vault_node3_password }}" + node_role: worker + datacenter: remote-dc + location: "Remote Datacenter" + gpu: true + gpu_type: "rtx3090" + gpu_memory: "24GB" + + # GPU nodes (subset of workers) + gpu_nodes: + hosts: + node3: + + # Database nodes + database_nodes: + hosts: + node1: + node3: + + # Local development + local_dev: + hosts: + node2: + ansible_host: localhost + ansible_connection: local + node_role: development + datacenter: local + location: "MacBook Pro M4" diff --git a/infrastructure/ansible/kubeconfig/.gitignore b/infrastructure/ansible/kubeconfig/.gitignore new file mode 100644 index 00000000..9e920465 --- /dev/null +++ b/infrastructure/ansible/kubeconfig/.gitignore @@ -0,0 +1,2 @@ +*.yaml +!.gitkeep diff --git a/infrastructure/ansible/kubeconfig/.gitkeep b/infrastructure/ansible/kubeconfig/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/infrastructure/ansible/playbooks/bootstrap.yml b/infrastructure/ansible/playbooks/bootstrap.yml new file mode 100644 index 00000000..85e4effb --- /dev/null +++ b/infrastructure/ansible/playbooks/bootstrap.yml @@ -0,0 +1,143 @@ +# DAARION Network - Bootstrap Playbook +# Initial setup for all nodes: packages, SSH, hostname, etc. 
+--- +- name: Bootstrap all nodes + hosts: all + become: yes + + vars: + common_packages: + - curl + - wget + - git + - htop + - vim + - jq + - unzip + - ca-certificates + - gnupg + - lsb-release + - net-tools + - dnsutils + - bc + + tasks: + # ========================================================================= + # BASIC SETUP + # ========================================================================= + - name: Set timezone + timezone: + name: "{{ timezone }}" + + - name: Set hostname + hostname: + name: "{{ inventory_hostname }}" + + - name: Update /etc/hosts with all nodes + lineinfile: + path: /etc/hosts + line: "{{ hostvars[item].ansible_host }} {{ item }}" + state: present + loop: "{{ groups['all'] }}" + when: + - hostvars[item].ansible_host is defined + - hostvars[item].ansible_host != 'localhost' + + # ========================================================================= + # PACKAGES + # ========================================================================= + - name: Update apt cache + apt: + update_cache: yes + cache_valid_time: 3600 + when: ansible_os_family == "Debian" + + - name: Upgrade all packages + apt: + upgrade: safe + when: ansible_os_family == "Debian" + + - name: Install common packages + apt: + name: "{{ common_packages }}" + state: present + when: ansible_os_family == "Debian" + + # ========================================================================= + # USERS & SSH + # ========================================================================= + - name: Create admin group + group: + name: daarion-admin + state: present + + - name: Create directories + file: + path: "{{ item }}" + state: directory + mode: '0755' + loop: + - "{{ scripts_dir }}" + - "{{ config_dir }}" + - "{{ logs_dir }}" + - "{{ backup_dir }}" + + # ========================================================================= + # SSH HARDENING + # ========================================================================= + - name: Disable root login via SSH 
(workers only) + lineinfile: + path: /etc/ssh/sshd_config + regexp: '^#?PermitRootLogin' + line: 'PermitRootLogin prohibit-password' + notify: restart sshd + when: "'workers' in group_names" + + - name: Set SSH MaxAuthTries + lineinfile: + path: /etc/ssh/sshd_config + regexp: '^#?MaxAuthTries' + line: 'MaxAuthTries 3' + notify: restart sshd + + - name: Set SSH ClientAliveInterval + lineinfile: + path: /etc/ssh/sshd_config + regexp: '^#?ClientAliveInterval' + line: 'ClientAliveInterval 300' + notify: restart sshd + + # ========================================================================= + # KERNEL PARAMETERS + # ========================================================================= + - name: Set kernel parameters for containers + sysctl: + name: "{{ item.name }}" + value: "{{ item.value }}" + state: present + reload: yes + loop: + - { name: 'net.ipv4.ip_forward', value: '1' } + - { name: 'net.bridge.bridge-nf-call-iptables', value: '1' } + - { name: 'net.bridge.bridge-nf-call-ip6tables', value: '1' } + - { name: 'fs.inotify.max_user_watches', value: '524288' } + - { name: 'fs.inotify.max_user_instances', value: '512' } + ignore_errors: yes # Some params may not exist on all systems + + # ========================================================================= + # VERIFICATION + # ========================================================================= + - name: Verify setup + debug: + msg: | + Node: {{ inventory_hostname }} + Host: {{ ansible_host }} + Datacenter: {{ datacenter | default('unknown') }} + Role: {{ node_role | default('unknown') }} + GPU: {{ gpu | default(false) }} + + handlers: + - name: restart sshd + service: + name: sshd + state: restarted diff --git a/infrastructure/ansible/playbooks/hardening.yml b/infrastructure/ansible/playbooks/hardening.yml new file mode 100644 index 00000000..be4059e6 --- /dev/null +++ b/infrastructure/ansible/playbooks/hardening.yml @@ -0,0 +1,288 @@ +# DAARION Network - Security Hardening Playbook +# 
Comprehensive security setup for all nodes +--- +- name: Security Hardening + hosts: all + become: yes + + vars: + allowed_ssh_port: "{{ ansible_port | default(22) }}" + + tasks: + # ========================================================================= + # SECURITY PACKAGES + # ========================================================================= + - name: Install security packages + apt: + name: "{{ security_packages }}" + state: present + when: ansible_os_family == "Debian" + + - name: Install Trivy (vulnerability scanner) + shell: | + curl -sfL https://raw.githubusercontent.com/aquasecurity/trivy/main/contrib/install.sh | sh -s -- -b /usr/local/bin + args: + creates: /usr/local/bin/trivy + + # ========================================================================= + # UFW FIREWALL + # ========================================================================= + - name: UFW - Reset to defaults + ufw: + state: reset + + - name: UFW - Default deny incoming + ufw: + direction: incoming + policy: deny + + - name: UFW - Default deny outgoing + ufw: + direction: outgoing + policy: deny + + - name: UFW - Allow SSH + ufw: + rule: allow + port: "{{ allowed_ssh_port }}" + proto: tcp + + - name: UFW - Allow necessary TCP ports + ufw: + rule: allow + port: "{{ item }}" + proto: tcp + loop: "{{ firewall_allowed_tcp_ports }}" + when: firewall_allowed_tcp_ports is defined + + - name: UFW - Allow necessary outgoing + ufw: + rule: allow + direction: out + port: "{{ item.port }}" + proto: "{{ item.proto }}" + loop: "{{ firewall_allowed_outgoing }}" + + - name: UFW - Block internal networks + ufw: + rule: deny + direction: out + to_ip: "{{ item }}" + loop: "{{ firewall_blocked_networks }}" + when: firewall_blocked_networks is defined + + - name: UFW - Enable + ufw: + state: enabled + + # ========================================================================= + # FAIL2BAN + # ========================================================================= + - name: Configure 
fail2ban + copy: + dest: /etc/fail2ban/jail.local + content: | + [DEFAULT] + bantime = 3600 + findtime = 600 + maxretry = 3 + + [sshd] + enabled = true + port = {{ allowed_ssh_port }} + filter = sshd + logpath = /var/log/auth.log + maxretry = 3 + bantime = 86400 + notify: restart fail2ban + + - name: Enable fail2ban + service: + name: fail2ban + enabled: yes + state: started + + # ========================================================================= + # AUDITD + # ========================================================================= + - name: Configure auditd rules + copy: + dest: /etc/audit/rules.d/daarion.rules + content: | + # Monitor file changes in critical directories + -w /etc/passwd -p wa -k passwd_changes + -w /etc/shadow -p wa -k shadow_changes + -w /etc/ssh/sshd_config -p wa -k sshd_config + + # Monitor Docker + -w /var/lib/docker -p wa -k docker + -w /etc/docker -p wa -k docker_config + + # Monitor cron + -w /etc/crontab -p wa -k cron + -w /etc/cron.d -p wa -k cron + + # Monitor tmp (malware indicator) + -w /tmp -p x -k tmp_exec + -w /var/tmp -p x -k var_tmp_exec + notify: restart auditd + + - name: Enable auditd + service: + name: auditd + enabled: yes + state: started + + # ========================================================================= + # KERNEL HARDENING + # ========================================================================= + - name: Kernel security parameters + sysctl: + name: "{{ item.name }}" + value: "{{ item.value }}" + state: present + reload: yes + loop: + - { name: 'net.ipv4.conf.all.accept_redirects', value: '0' } + - { name: 'net.ipv4.conf.default.accept_redirects', value: '0' } + - { name: 'net.ipv4.conf.all.send_redirects', value: '0' } + - { name: 'net.ipv4.conf.default.send_redirects', value: '0' } + - { name: 'net.ipv4.tcp_syncookies', value: '1' } + - { name: 'net.ipv4.icmp_echo_ignore_broadcasts', value: '1' } + - { name: 'kernel.randomize_va_space', value: '2' } + - { name: 'kernel.kptr_restrict', value: 
'2' } + - { name: 'kernel.dmesg_restrict', value: '1' } + + # ========================================================================= + # SECURITY CHECK SCRIPT + # ========================================================================= + - name: Deploy security check script + copy: + dest: "{{ scripts_dir }}/security-check.sh" + mode: '0755' + content: | + #!/bin/bash + # DAARION Security Check Script + # Runs hourly via cron + + LOG="{{ logs_dir }}/security-$(date +%Y%m%d).log" + ALERT_FILE="/tmp/security_alert" + + log() { + echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" >> "$LOG" + } + + log "=== Security Check Started ===" + + # Check for suspicious processes + SUSPICIOUS=$(ps aux | grep -E "(xmrig|kdevtmp|kinsing|perfctl|httpd.*tmp|mysql.*tmp)" | grep -v grep) + if [ -n "$SUSPICIOUS" ]; then + log "CRITICAL: Suspicious process detected!" + log "$SUSPICIOUS" + pkill -9 -f "xmrig|kdevtmp|kinsing|perfctl" + touch "$ALERT_FILE" + fi + + # Check for executables in /tmp + TMP_EXEC=$(find /tmp /var/tmp /dev/shm -type f -executable 2>/dev/null) + if [ -n "$TMP_EXEC" ]; then + log "WARNING: Executable files in tmp directories!" 
+ log "$TMP_EXEC" + rm -f $TMP_EXEC 2>/dev/null + fi + + # Check CPU usage (potential mining) + LOAD=$(cat /proc/loadavg | cut -d' ' -f1) + CPU_COUNT=$(nproc) + THRESHOLD=$(echo "$CPU_COUNT * 2" | bc) + if (( $(echo "$LOAD > $THRESHOLD" | bc -l) )); then + log "WARNING: High CPU load: $LOAD (threshold: $THRESHOLD)" + fi + + # Check for unauthorized SSH keys + for user_home in /root /home/*; do + if [ -f "$user_home/.ssh/authorized_keys" ]; then + KEY_COUNT=$(wc -l < "$user_home/.ssh/authorized_keys") + log "INFO: $user_home has $KEY_COUNT SSH keys" + fi + done + + # Check failed SSH attempts + FAILED_SSH=$(grep "Failed password" /var/log/auth.log 2>/dev/null | wc -l) + log "INFO: Failed SSH attempts today: $FAILED_SSH" + + # Check Docker containers + if command -v docker &> /dev/null; then + CONTAINER_COUNT=$(docker ps -q | wc -l) + log "INFO: Running Docker containers: $CONTAINER_COUNT" + + # Check for containers running as root + docker ps -q | while read cid; do + USER=$(docker inspect --format '{% raw %}{{.Config.User}}{% endraw %}' $cid) + NAME=$(docker inspect --format '{% raw %}{{.Name}}{% endraw %}' $cid) + if [ -z "$USER" ] || [ "$USER" = "root" ] || [ "$USER" = "0" ]; then + log "WARNING: Container $NAME running as root" + fi + done + fi + + log "=== Security Check Completed ===" + + - name: Setup security cron + cron: + name: "Hourly security check" + minute: "0" + job: "{{ scripts_dir }}/security-check.sh" + + - name: Setup daily rkhunter scan + cron: + name: "Daily rkhunter scan" + hour: "3" + minute: "0" + job: "rkhunter --update && rkhunter --check --skip-keypress > {{ logs_dir }}/rkhunter.log 2>&1" + + # ========================================================================= + # AUTO UPDATES + # ========================================================================= + - name: Configure unattended-upgrades + copy: + dest: /etc/apt/apt.conf.d/50unattended-upgrades + content: | + Unattended-Upgrade::Allowed-Origins { + "${distro_id}:${distro_codename}"; + 
"${distro_id}:${distro_codename}-security"; + "${distro_id}ESMApps:${distro_codename}-apps-security"; + "${distro_id}ESM:${distro_codename}-infra-security"; + }; + Unattended-Upgrade::AutoFixInterruptedDpkg "true"; + Unattended-Upgrade::Remove-Unused-Dependencies "true"; + Unattended-Upgrade::Automatic-Reboot "false"; + when: ansible_os_family == "Debian" + + # ========================================================================= + # VERIFICATION + # ========================================================================= + - name: Verify security setup + shell: | + echo "=== Security Status ===" + echo "UFW: $(ufw status | head -1)" + echo "Fail2ban: $(systemctl is-active fail2ban)" + echo "Auditd: $(systemctl is-active auditd)" + echo "Trivy: $(trivy --version 2>/dev/null | head -1 || echo 'not installed')" + register: security_status + changed_when: false + + - name: Show security status + debug: + var: security_status.stdout_lines + + handlers: + - name: restart fail2ban + service: + name: fail2ban + state: restarted + + - name: restart auditd + service: + name: auditd + state: restarted diff --git a/infrastructure/ansible/playbooks/k3s-install.yml b/infrastructure/ansible/playbooks/k3s-install.yml new file mode 100644 index 00000000..1eb6324f --- /dev/null +++ b/infrastructure/ansible/playbooks/k3s-install.yml @@ -0,0 +1,183 @@ +# DAARION Network - K3s Installation Playbook +# Lightweight Kubernetes cluster setup +--- +# ============================================================================= +# INSTALL K3S SERVER (MASTERS) +# ============================================================================= +- name: Install K3s Server on Masters + hosts: masters + become: yes + + tasks: + - name: Check if K3s is already installed + stat: + path: /etc/rancher/k3s/k3s.yaml + register: k3s_installed + + - name: Download K3s installer + get_url: + url: https://get.k3s.io + dest: /tmp/k3s-install.sh + mode: '0755' + when: not k3s_installed.stat.exists + + - 
name: Install K3s server + shell: | + INSTALL_K3S_VERSION={{ k3s_version }} \ + sh /tmp/k3s-install.sh server \ + --disable traefik \ + --disable servicelb \ + --write-kubeconfig-mode 644 \ + --tls-san {{ ansible_host }} \ + --tls-san {{ inventory_hostname }} \ + --node-label "datacenter={{ datacenter }}" \ + --node-label "node-role={{ node_role }}" \ + --cluster-cidr {{ daarion_network_cidr | default('10.42.0.0/16') }} \ + --service-cidr {{ daarion_service_cidr | default('10.43.0.0/16') }} + args: + creates: /etc/rancher/k3s/k3s.yaml + register: k3s_install + + - name: Wait for K3s to be ready + wait_for: + port: 6443 + delay: 10 + timeout: 300 + + - name: Wait for node to be ready + shell: | + export KUBECONFIG=/etc/rancher/k3s/k3s.yaml + kubectl wait --for=condition=Ready node/{{ inventory_hostname }} --timeout=300s + register: node_ready + retries: 10 + delay: 10 + until: node_ready.rc == 0 + + - name: Get K3s token + slurp: + src: /var/lib/rancher/k3s/server/node-token + register: k3s_token_file + + - name: Save K3s token as fact + set_fact: + k3s_join_token: "{{ k3s_token_file.content | b64decode | trim }}" + + - name: Fetch kubeconfig + fetch: + src: /etc/rancher/k3s/k3s.yaml + dest: "{{ playbook_dir }}/../kubeconfig/{{ inventory_hostname }}.yaml" + flat: yes + + - name: Update kubeconfig with external IP + delegate_to: localhost + become: no + replace: + path: "{{ playbook_dir }}/../kubeconfig/{{ inventory_hostname }}.yaml" + regexp: '127.0.0.1' + replace: "{{ ansible_host }}" + + - name: Show K3s status + shell: | + export KUBECONFIG=/etc/rancher/k3s/k3s.yaml + kubectl get nodes -o wide + register: k3s_status + changed_when: false + + - name: Display K3s status + debug: + var: k3s_status.stdout_lines + +# ============================================================================= +# INSTALL K3S AGENT (WORKERS) +# ============================================================================= +- name: Install K3s Agent on Workers + hosts: workers + become: 
yes + + vars: + k3s_master_host: "{{ hostvars[groups['masters'][0]].ansible_host }}" + k3s_master_token: "{{ hostvars[groups['masters'][0]].k3s_join_token }}" + + tasks: + - name: Check if K3s agent is already installed + stat: + path: /var/lib/rancher/k3s/agent + register: k3s_agent_installed + + - name: Download K3s installer + get_url: + url: https://get.k3s.io + dest: /tmp/k3s-install.sh + mode: '0755' + when: not k3s_agent_installed.stat.exists + + - name: Build node labels + set_fact: + node_labels: >- + --node-label datacenter={{ datacenter }} + --node-label node-role={{ node_role }} + {% if gpu is defined and gpu %} + --node-label gpu=true + --node-label gpu-type={{ gpu_type | default('unknown') }} + --node-label gpu-memory={{ gpu_memory | default('unknown') }} + {% endif %} + + - name: Install K3s agent + shell: | + INSTALL_K3S_VERSION={{ k3s_version }} \ + K3S_URL=https://{{ k3s_master_host }}:6443 \ + K3S_TOKEN={{ k3s_master_token }} \ + sh /tmp/k3s-install.sh agent \ + {{ node_labels }} + args: + creates: /var/lib/rancher/k3s/agent + register: k3s_agent_install + + - name: Wait for agent to connect + pause: + seconds: 30 + when: k3s_agent_install.changed + +# ============================================================================= +# VERIFY CLUSTER +# ============================================================================= +- name: Verify K3s Cluster + hosts: masters + become: yes + + tasks: + - name: Get cluster nodes + shell: | + export KUBECONFIG=/etc/rancher/k3s/k3s.yaml + kubectl get nodes -o wide + register: cluster_nodes + changed_when: false + + - name: Display cluster nodes + debug: + var: cluster_nodes.stdout_lines + + - name: Get cluster info + shell: | + export KUBECONFIG=/etc/rancher/k3s/k3s.yaml + kubectl cluster-info + register: cluster_info + changed_when: false + + - name: Display cluster info + debug: + var: cluster_info.stdout_lines + + - name: Create daarion namespace + shell: | + export KUBECONFIG=/etc/rancher/k3s/k3s.yaml 
+ kubectl create namespace daarion --dry-run=client -o yaml | kubectl apply -f - + changed_when: false + + - name: Label GPU nodes + shell: | + export KUBECONFIG=/etc/rancher/k3s/k3s.yaml + kubectl label nodes {{ item }} nvidia.com/gpu=true --overwrite + loop: "{{ groups['gpu_nodes'] | default([]) }}" + when: groups['gpu_nodes'] is defined + ignore_errors: yes