mirror_zfs/.github/workflows/scripts/qemu-7-prepare.sh

#!/usr/bin/env bash

######################################################################
# 7) prepare output of the results
# - this script pre-creates all needed logfiles for later summary
#
# Inputs:
# - /var/tmp/env.txt is sourced; RESPATH and OS are used from it (or
#   from the environment). VMs holds the number of test VMs; when it
#   is empty or unset the module build is treated as failed.
#
# Outputs (relative paths are below $RESPATH):
# - summary.txt, summary-failure-logs.txt, failed.txt, uname.txt
# - /tmp/summary.txt, /tmp/failed.txt, /tmp/vm<N>log.txt,
#   /tmp/vm<N>dbg.txt, /tmp/have_failed_tests
# - /tmp/qemu-$OS.tar containing everything below $RESPATH
######################################################################

set -eu

# read our defined variables
cd /var/tmp
# shellcheck source=/dev/null
source env.txt

# Fail early with a readable message instead of a bare "unbound variable".
: "${RESPATH:?RESPATH is not set}"
: "${OS?OS is not set}"

# Print the log sections of failed or killed tests found in file $1.
# Printing starts at each line containing [FAIL] or [KILLED] and stops
# at the next line containing [SKIP] or [PASS].
extract_failures() {
  awk '
    /\[FAIL\]|\[KILLED\]/ { show = 1; print; next }
    /\[SKIP\]|\[PASS\]/   { show = 0 }
    show
  ' "$1"
}

# Pack everything below $RESPATH (dereferencing symlinks) into the
# artifact tarball. Best effort: a tar failure must not fail this step.
pack_artifact() {
  tar cf "/tmp/qemu-$OS.tar" -C "$RESPATH" -h . || true
}

mkdir -p "$RESPATH"

# check if building the module has failed
if [ -z "${VMs:-}" ]; then
  cd "$RESPATH"
  echo ":exclamation: ZFS module didn't build successfully :exclamation:" \
    | tee summary.txt /tmp/summary.txt
  pack_artifact
  exit 0
fi

# build was okay
BASE="$HOME/work/zfs/zfs"
MERGE="$BASE/.github/workflows/scripts/merge_summary.awk"

# catch result files of testings (vm's should be there)
for ((i = 1; i <= VMs; i++)); do
  # Make sure the per-VM directory exists even if both transfers fail,
  # so the fallback exit code can still be written into it later.
  mkdir -p "$RESPATH/vm$i"
  rsync -arL "zfs@192.168.122.1$i:$RESPATH/current" "$RESPATH/vm$i" || true
  scp "zfs@192.168.122.1$i:/var/tmp/*.txt" "$RESPATH/vm$i" || true
done
cp -f /var/tmp/*.txt "$RESPATH" || true
cd "$RESPATH"

# prepare result files for summary
for ((i = 1; i <= VMs; i++)); do
  # Promote the build results and uname of a VM to the top level; a
  # later VM overwrites the files of an earlier one.
  for name in build-stderr.txt build-exitcode.txt uname.txt; do
    if [ -s "vm$i/$name" ]; then
      mv -f "vm$i/$name" "$name"
    fi
  done

  file="vm$i/tests-exitcode.txt"
  if [ ! -s "$file" ]; then
    # XXX - add some tests for kernel panic's here
    # tail -n 80 vm$i/console.txt | grep XYZ
    echo 1 > "$file"
  fi
  # Strip all whitespace so a blank or malformed file is compared as a
  # plain string; anything that is not exactly "0" counts as a failure.
  rv=$(tr -d '[:space:]' < "$file")
  if [ "$rv" != 0 ]; then
    touch /tmp/have_failed_tests
  fi

  file="vm$i/current/log"
  if [ -s "$file" ]; then
    cat "$file" >> log
    extract_failures "$file" > "/tmp/vm${i}dbg.txt"
  fi

  file="vm${i}log.txt"
  if [ -s "$file" ]; then
    cat "$file" >> summary
    "$BASE/scripts/zfs-tests-color.sh" < "$file" > "/tmp/vm${i}log.txt"
  fi
done

# create summary of tests
if [ -s summary ]; then
  # grep exits 1 when it selects no line; that is not an error here and
  # must not abort the script under 'set -e'. Real grep errors (>1) do.
  "$MERGE" summary | { grep -v '^/' || [ "$?" -eq 1 ]; } > summary.txt
  "$MERGE" summary | "$BASE/scripts/zfs-tests-color.sh" > /tmp/summary.txt
  rm -f summary
else
  touch summary.txt /tmp/summary.txt
fi

# create file for debugging
if [ -s log ]; then
  extract_failures log > summary-failure-logs.txt
  rm -f log
else
  touch summary-failure-logs.txt
fi

# create debug overview for failed tests
# Every non-SKIP summary line tagged "(expected PASS)" names a test in
# its second field; collect the matching log section for each of them.
awk '/\(expected PASS\)/ && $1 != "SKIP" { print $2 }' summary.txt \
  | while read -r t; do
  # Match the test name literally (index) rather than as a regex.
  awk -v v="$t" '
    /Test[: ]/   { show = 0 }
    index($0, v) { show = 1 }
    show
  ' summary-failure-logs.txt > /tmp/fail.txt
  SIZE=$(wc -c < /tmp/fail.txt)
  SIZE=$((SIZE / 1024))
  # Test Summary:
  {
    echo "##[group]$t ($SIZE KiB)"
    "$BASE/scripts/zfs-tests-color.sh" < /tmp/fail.txt
    echo "##[endgroup]"
  } >> /tmp/failed.txt
  # Job Summary:
  {
    printf '\n<details>\n<summary>%s (%s KiB)</summary><pre>\n' "$t" "$SIZE"
    cat /tmp/fail.txt
    echo "</pre></details>"
  } >> failed.txt
done

if [ -e /tmp/have_failed_tests ]; then
  echo ":warning: Some tests failed!" >> failed.txt
else
  echo ":thumbsup: All tests passed." >> failed.txt
fi

if [ ! -s uname.txt ]; then
  echo ":interrobang: Panic - where is my uname.txt?" > uname.txt
fi

# artifact ready now
pack_artifact
ZTS: Use QEMU for tests on Linux and FreeBSD This commit adds functional tests for these systems: - AlmaLinux 8, AlmaLinux 9, ArchLinux - CentOS Stream 9, Fedora 39, Fedora 40 - Debian 11, Debian 12 - FreeBSD 13, FreeBSD 14, FreeBSD 15 - Ubuntu 20.04, Ubuntu 22.04, Ubuntu 24.04 - enabled by default: - AlmaLinux 8, AlmaLinux 9 - Debian 11, Debian 12 - Fedora 39, Fedora 40 - FreeBSD 13, FreeBSD 14 Workflow for each operating system: - install qemu on the github runner - download current cloud image of operating system - start and init that image via cloud-init - install dependencies and poweroff system - start system and build openzfs and then poweroff again - clone build system and start 2 instances of it - run functional testings and complete in around 3h - when tests are done, do some logfile preparing - show detailed results for each system - in the end, generate the job summary Real-world benefits from this PR: 1. The github runner scripts are in the zfs repo itself. That means you can just open a PR against zfs, like "Add Fedora 41 tester", and see the results directly in the PR. ZFS admins no longer need to manually log in to the buildbot server to update the buildbot config with new versions of Fedora/Almalinux. 2. Github runners allow you to run the entire test suite against your private branch before submitting a formal PR to openzfs. Just open a PR against your private zfs repo, and the exact same Fedora/Alma/FreeBSD runners will fire up and run ZTS. This can be useful if you want to iterate on a ZTS change before submitting a formal PR. 3. buildbot is incredibly cumbersome. Our buildbot config files alone are ~1500 lines (not including any build/setup scripts)! It's a huge pain to set up. 4. We're running the super ancient buildbot 0.8.12. It's so ancient it requires python2. We actually have to build python2 from source for almalinux9 just to get it to run. Upgrading to a more modern buildbot is a huge undertaking, and the UI on the newer versions is worse. 
5. Buildbot uses EC2 instances. EC2 is a pain because: * It costs money * They throttle IOPS and CPU usage, leading to mysterious, hard-to-diagnose failures and timeouts in ZTS. * EC2 is high maintenance. We have to set up security groups, SSH keys, networking, users, etc, in AWS and it's a pain. We also have to periodically go in and kill zombie EC2 instances that buildbot is unable to kill off. 6. Buildbot doesn't always handle failures well. One of the things we saw in the past was the FreeBSD builders would often die, and each builder death would take up a "slot" in buildbot. So we would periodically have to restart buildbot via a cron job to get the slots back. 7. This PR divides up the ZTS test list into two parts, launches two VMs, and on each VM runs half the test suite. The test results are then merged and shown in the summary page. So we're basically parallelizing ZTS on the same github runner. This leads to lower overall ZTS runtimes (2.5-3 hours vs 4+ hours on buildbot), and one unified set of results per runner, which is nice. 8. Since the tests are running on a VM, we have much more control over what happens. We can capture the serial console output even if the test completely brings down the VM. In the future, we could also restart the test on the VM where it left off, so that if a single test panics the VM, we can just restart it and run the remaining ZTS tests (this functionality is not yet implemented though, just an idea). 9. Using the runners, users can manually kill or restart a test run via the github UI. That really isn't possible with buildbot unless you're an admin. 10. Anecdotally, the tests seem to be more stable and constant under the QEMU runners. Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Tino Reichardt <milky-zfs@mcmilk.de> Signed-off-by: Tony Hutter <hutter2@llnl.gov> Closes #16537 2024-06-17 17:52:58 +03:00			`#!/usr/bin/env bash`

			`######################################################################`
			`# 7) prepare output of the results`
			`# - this script pre-creates all needed logfiles for later summary`
			`######################################################################`

			`set -eu`

			`# read our defined variables`
			`cd /var/tmp`
			`source env.txt`

			`mkdir -p $RESPATH`

			`# check if building the module has failed`
			`if [ -z ${VMs:-} ]; then`
			`cd $RESPATH`
			`echo ":exclamation: ZFS module didn't build successfully :exclamation:" \`
			`\| tee summary.txt \| tee /tmp/summary.txt`
			`tar cf /tmp/qemu-$OS.tar -C $RESPATH -h . \|\| true`
			`exit 0`
			`fi`

			`# build was okay`
			`BASE="$HOME/work/zfs/zfs"`
			`MERGE="$BASE/.github/workflows/scripts/merge_summary.awk"`

			`# catch result files of testings (vm's should be there)`
			`for i in $(seq 1 $VMs); do`
			`rsync -arL zfs@192.168.122.1$i:$RESPATH/current $RESPATH/vm$i \|\| true`
			`scp zfs@192.168.122.1$i:"/var/tmp/*.txt" $RESPATH/vm$i \|\| true`
			`done`
			`cp -f /var/tmp/*.txt $RESPATH \|\| true`
			`cd $RESPATH`

			`# prepare result files for summary`
			`for i in $(seq 1 $VMs); do`
			`file="vm$i/build-stderr.txt"`
			`test -s $file && mv -f $file build-stderr.txt`

			`file="vm$i/build-exitcode.txt"`
			`test -s $file && mv -f $file build-exitcode.txt`

			`file="vm$i/uname.txt"`
			`test -s $file && mv -f $file uname.txt`

			`file="vm$i/tests-exitcode.txt"`
ZTS: Fix Test Summary page generation Fix that error: "cat /tmp/failed.txt: No such file or directory" Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Tony Hutter <hutter2@llnl.gov> Signed-off-by: Tino Reichardt <milky-zfs@mcmilk.de> Closes #16549 2024-09-18 07:40:49 +03:00			`if [ ! -s $file ]; then`
			`# XXX - add some tests for kernel panic's here`
			`# tail -n 80 vm$i/console.txt \| grep XYZ`
			`echo 1 > $file`
			`fi`
ZTS: Use QEMU for tests on Linux and FreeBSD This commit adds functional tests for these systems: - AlmaLinux 8, AlmaLinux 9, ArchLinux - CentOS Stream 9, Fedora 39, Fedora 40 - Debian 11, Debian 12 - FreeBSD 13, FreeBSD 14, FreeBSD 15 - Ubuntu 20.04, Ubuntu 22.04, Ubuntu 24.04 - enabled by default: - AlmaLinux 8, AlmaLinux 9 - Debian 11, Debian 12 - Fedora 39, Fedora 40 - FreeBSD 13, FreeBSD 14 Workflow for each operating system: - install qemu on the github runner - download current cloud image of operating system - start and init that image via cloud-init - install dependencies and poweroff system - start system and build openzfs and then poweroff again - clone build system and start 2 instances of it - run functional testings and complete in around 3h - when tests are done, do some logfile preparing - show detailed results for each system - in the end, generate the job summary Real-world benefits from this PR: 1. The github runner scripts are in the zfs repo itself. That means you can just open a PR against zfs, like "Add Fedora 41 tester", and see the results directly in the PR. ZFS admins no longer need to manually log in to the buildbot server to update the buildbot config with new versions of Fedora/Almalinux. 2. Github runners allow you to run the entire test suite against your private branch before submitting a formal PR to openzfs. Just open a PR against your private zfs repo, and the exact same Fedora/Alma/FreeBSD runners will fire up and run ZTS. This can be useful if you want to iterate on a ZTS change before submitting a formal PR. 3. buildbot is incredibly cumbersome. Our buildbot config files alone are ~1500 lines (not including any build/setup scripts)! It's a huge pain to set up. 4. We're running the super ancient buildbot 0.8.12. It's so ancient it requires python2. We actually have to build python2 from source for almalinux9 just to get it to run. Upgrading to a more modern buildbot is a huge undertaking, and the UI on the newer versions is worse. 
5. Buildbot uses EC2 instances. EC2 is a pain because: * It costs money * They throttle IOPS and CPU usage, leading to mysterious, hard-to-diagnose failures and timeouts in ZTS. * EC2 is high maintenance. We have to set up security groups, SSH keys, networking, users, etc, in AWS and it's a pain. We also have to periodically go in and kill zombie EC2 instances that buildbot is unable to kill off. 6. Buildbot doesn't always handle failures well. One of the things we saw in the past was the FreeBSD builders would often die, and each builder death would take up a "slot" in buildbot. So we would periodically have to restart buildbot via a cron job to get the slots back. 7. This PR divides up the ZTS test list into two parts, launches two VMs, and on each VM runs half the test suite. The test results are then merged and shown in the summary page. So we're basically parallelizing ZTS on the same github runner. This leads to lower overall ZTS runtimes (2.5-3 hours vs 4+ hours on buildbot), and one unified set of results per runner, which is nice. 8. Since the tests are running on a VM, we have much more control over what happens. We can capture the serial console output even if the test completely brings down the VM. In the future, we could also restart the test on the VM where it left off, so that if a single test panics the VM, we can just restart it and run the remaining ZTS tests (this functionality is not yet implemented though, just an idea). 9. Using the runners, users can manually kill or restart a test run via the github UI. That really isn't possible with buildbot unless you're an admin. 10. Anecdotally, the tests seem to be more stable and constant under the QEMU runners. Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Tino Reichardt <milky-zfs@mcmilk.de> Signed-off-by: Tony Hutter <hutter2@llnl.gov> Closes #16537 2024-06-17 17:52:58 +03:00			`rv=$(cat vm$i/tests-exitcode.txt)`
			`test $rv != 0 && touch /tmp/have_failed_tests`

			`file="vm$i/current/log"`
			`if [ -s $file ]; then`
			`cat $file >> log`
			`awk '/\[FAIL\]\|\[KILLED\]/{ show=1; print; next; }; \`
			`/\[SKIP\]\|\[PASS\]/{ show=0; } show' \`
			`$file > /tmp/vm${i}dbg.txt`
			`fi`

			`file="vm${i}log.txt"`
			`fileC="/tmp/vm${i}log.txt"`
			`if [ -s $file ]; then`
			`cat $file >> summary`
			`cat $file \| $BASE/scripts/zfs-tests-color.sh > $fileC`
			`fi`
			`done`

			`# create summary of tests`
			`if [ -s summary ]; then`
			`$MERGE summary \| grep -v '^/' > summary.txt`
			`$MERGE summary \| $BASE/scripts/zfs-tests-color.sh > /tmp/summary.txt`
			`rm -f summary`
			`else`
			`touch summary.txt /tmp/summary.txt`
			`fi`

			`# create file for debugging`
			`if [ -s log ]; then`
			`awk '/\[FAIL\]\|\[KILLED\]/{ show=1; print; next; }; \`
			`/\[SKIP\]\|\[PASS\]/{ show=0; } show' \`
			`log > summary-failure-logs.txt`
			`rm -f log`
			`else`
			`touch summary-failure-logs.txt`
			`fi`

			`# create debug overview for failed tests`
			`cat summary.txt \`
			`\| awk '/\(expected PASS\)/{ if ($1!="SKIP") print $2; next; } show' \`
			`\| while read t; do`
			`cat summary-failure-logs.txt \`
			`\| awk '$0~/Test[: ]/{ show=0; } $0~v{ show=1; } show' v="$t" \`
			`> /tmp/fail.txt`
			`SIZE=$(stat --printf="%s" /tmp/fail.txt)`
			`SIZE=$((SIZE/1024))`
			`# Test Summary:`
			`echo "##[group]$t ($SIZE KiB)" >> /tmp/failed.txt`
			`cat /tmp/fail.txt \| $BASE/scripts/zfs-tests-color.sh >> /tmp/failed.txt`
			`echo "##[endgroup]" >> /tmp/failed.txt`
			`# Job Summary:`
			`echo -e "\n<details>\n<summary>$t ($SIZE KiB)</summary><pre>" >> failed.txt`
			`cat /tmp/fail.txt >> failed.txt`
			`echo "</pre></details>" >> failed.txt`
			`done`

			`if [ -e /tmp/have_failed_tests ]; then`
			`echo ":warning: Some tests failed!" >> failed.txt`
			`else`
			`echo ":thumbsup: All tests passed." >> failed.txt`
			`fi`

			`if [ ! -s uname.txt ]; then`
			`echo ":interrobang: Panic - where is my uname.txt?" > uname.txt`
			`fi`

			`# artifact ready now`
			`tar cf /tmp/qemu-$OS.tar -C $RESPATH -h . \|\| true`