hs-test: CPU allocation improvements - Release build runs on NUMA node0, debug on node1. Using the last digit of a build number to reserve 4 cores per test means we can run 20 jobs (10 release, 10 debug) on the same machine, assuming we have 111 cores available (not counting core 0). Can be increased if needed; there are still some cores left. - Added separate NUMA-aware CPU allocation - Added CPU0=true|false (useful for users with 4c/8t) Type: test Change-Id: Iba8e492a4e01a7f457e49112303887a2a27f6af9 Signed-off-by: Adrian Villin <avillin@cisco.com>

commit: 5d171ebdc21efa4085e2c2130f595d7e0e1d2f59 [log] [tgz]
author: Adrian Villin <avillin@cisco.com> Mon Jun 17 08:51:27 2024 +0200
committer: Dave Wallace <dwallacelf@gmail.com> Mon Jul 08 16:27:38 2024 +0000
tree: 9b1e967d20e8c83f9780d5a76579db0edd3c5373
parent: 75e8e1e948da182dbf4f6b3394f1b7fc44c1403a [diff] [blame]
diff --git a/extras/hs-test/infra/cpu.go b/extras/hs-test/infra/cpu.go
index b5555d8..b26a06c 100644
--- a/extras/hs-test/infra/cpu.go
+++ b/extras/hs-test/infra/cpu.go

@@ -7,6 +7,7 @@
 	. "github.com/onsi/ginkgo/v2"
 	"os"
 	"os/exec"
+	"strconv"
 	"strings"
 )
 
@@ -18,80 +19,188 @@
 }
 
 type CpuAllocatorT struct {
-	cpus []int
+	cpus              []int
+	runningInCi       bool
+	buildNumber       int
+	maxContainerCount int
+}
+
+func iterateAndAppend(start int, end int, slice []int) []int {
+	for i := start; i <= end; i++ {
+		slice = append(slice, i)
+	}
+	return slice
 }
 
 var cpuAllocator *CpuAllocatorT = nil
 
 func (c *CpuAllocatorT) Allocate(containerCount int, nCpus int) (*CpuContext, error) {
 	var cpuCtx CpuContext
+	// indexes, not actual cores
+	var minCpu, maxCpu int
 
-	// splitting cpus into equal parts; this will over-allocate cores but it's good enough for now
-	maxContainerCount := 4
-	// skip CPU 0
-	minCpu := ((GinkgoParallelProcess() - 1) * maxContainerCount * nCpus) + 1
-	maxCpu := (GinkgoParallelProcess() * maxContainerCount * nCpus)
+	if c.runningInCi {
+		minCpu = ((c.buildNumber) * c.maxContainerCount * nCpus)
+		maxCpu = ((c.buildNumber + 1) * c.maxContainerCount * nCpus) - 1
+	} else {
+		minCpu = ((GinkgoParallelProcess() - 1) * c.maxContainerCount * nCpus)
+		maxCpu = (GinkgoParallelProcess() * c.maxContainerCount * nCpus) - 1
+	}
 
 	if len(c.cpus)-1 < maxCpu {
-		err := fmt.Errorf("could not allocate %d CPUs; available: %d; attempted to allocate cores %d-%d",
-			nCpus*containerCount, len(c.cpus)-1, minCpu, maxCpu)
+		err := fmt.Errorf("could not allocate %d CPUs; available count: %d; attempted to allocate cores with index %d-%d; max index: %d;\n"+
+			"available cores: %v", nCpus*containerCount, len(c.cpus), minCpu, maxCpu, len(c.cpus)-1, c.cpus)
 		return nil, err
 	}
+
 	if containerCount == 1 {
 		cpuCtx.cpus = c.cpus[minCpu : minCpu+nCpus]
-	} else if containerCount > 1 && containerCount <= maxContainerCount {
+	} else if containerCount > 1 && containerCount <= c.maxContainerCount {
 		cpuCtx.cpus = c.cpus[minCpu+(nCpus*(containerCount-1)) : minCpu+(nCpus*containerCount)]
 	} else {
-		return nil, fmt.Errorf("too many containers; CPU allocation for >%d containers is not implemented", maxContainerCount)
+		return nil, fmt.Errorf("too many containers; CPU allocation for >%d containers is not implemented", c.maxContainerCount)
 	}
-
 	cpuCtx.cpuAllocator = c
 	return &cpuCtx, nil
 }
 
 func (c *CpuAllocatorT) readCpus() error {
-	var first, last int
+	var first, second, third, fourth int
+	var file *os.File
+	var err error
 
-	// Path depends on cgroup version. We need to check which version is in use.
-	// For that following command can be used: 'stat -fc %T /sys/fs/cgroup/'
-	// In case the output states 'cgroup2fs' then cgroups v2 is used, 'tmpfs' in case cgroups v1.
-	cmd := exec.Command("stat", "-fc", "%T", "/sys/fs/cgroup/")
-	byteOutput, err := cmd.CombinedOutput()
-	if err != nil {
-		return err
-	}
-	CpuPath := CgroupPath
-	if strings.Contains(string(byteOutput), "tmpfs") {
-		CpuPath += "cpuset/cpuset.effective_cpus"
-	} else if strings.Contains(string(byteOutput), "cgroup2fs") {
-		CpuPath += "cpuset.cpus.effective"
+	if c.runningInCi {
+		// non-debug build runs on node0, debug on node1
+		if *IsDebugBuild {
+			file, err = os.Open("/sys/devices/system/node/node1/cpulist")
+		} else {
+			file, err = os.Open("/sys/devices/system/node/node0/cpulist")
+		}
+		if err != nil {
+			return err
+		}
+		defer file.Close()
+
+		sc := bufio.NewScanner(file)
+		sc.Scan()
+		line := sc.Text()
+		_, err = fmt.Sscanf(line, "%d-%d,%d-%d", &first, &second, &third, &fourth)
+		if err != nil {
+			return err
+		}
+
+		c.cpus = iterateAndAppend(first, second, c.cpus)
+		c.cpus = iterateAndAppend(third, fourth, c.cpus)
+	} else if NumaAwareCpuAlloc {
+		var fifth, sixth int
+		var tmpCpus []int
+
+		file, err := os.Open("/sys/devices/system/node/online")
+		if err != nil {
+			return err
+		}
+		defer file.Close()
+
+		sc := bufio.NewScanner(file)
+		sc.Scan()
+		line := sc.Text()
+		// get numa node range
+		_, err = fmt.Sscanf(line, "%d-%d", &first, &second)
+		if err != nil {
+			return err
+		}
+
+		for i := first; i <= second; i++ {
+			file, err := os.Open("/sys/devices/system/node/node" + fmt.Sprint(i) + "/cpulist")
+			if err != nil {
+				return err
+			}
+			defer file.Close()
+
+			// get numa node cores
+			sc := bufio.NewScanner(file)
+			sc.Scan()
+			line := sc.Text()
+			_, err = fmt.Sscanf(line, "%d-%d,%d-%d", &third, &fourth, &fifth, &sixth)
+			if err != nil {
+				return err
+			}
+
+			// get numa node cores from first range
+			tmpCpus = iterateAndAppend(third, fourth, tmpCpus)
+
+			// discard cpu 0
+			if tmpCpus[0] == 0 && !*UseCpu0{
+				tmpCpus = tmpCpus[1:]
+			}
+
+			// get numa node cores from second range
+			tmpCpus = iterateAndAppend(fifth, sixth, tmpCpus)
+
+			// make c.cpus divisible by maxContainerCount * nCpus, so we don't have to check which numa will be used
+			// and we can use offsets
+			count_to_remove := len(tmpCpus) % (c.maxContainerCount * *NConfiguredCpus)
+			c.cpus = append(c.cpus, tmpCpus[:len(tmpCpus)-count_to_remove]...)
+			tmpCpus = tmpCpus[:0]
+		}
 	} else {
-		return errors.New("cgroup unknown fs: " + string(byteOutput))
+		// Path depends on cgroup version. We need to check which version is in use.
+		// For that following command can be used: 'stat -fc %T /sys/fs/cgroup/'
+		// In case the output states 'cgroup2fs' then cgroups v2 is used, 'tmpfs' in case cgroups v1.
+		cmd := exec.Command("stat", "-fc", "%T", "/sys/fs/cgroup/")
+		byteOutput, err := cmd.CombinedOutput()
+		if err != nil {
+			return err
+		}
+
+		CpuPath := CgroupPath
+		if strings.Contains(string(byteOutput), "tmpfs") {
+			CpuPath += "cpuset/cpuset.effective_cpus"
+		} else if strings.Contains(string(byteOutput), "cgroup2fs") {
+			CpuPath += "cpuset.cpus.effective"
+		} else {
+			return errors.New("cgroup unknown fs: " + string(byteOutput))
+		}
+
+		file, err := os.Open(CpuPath)
+		if err != nil {
+			return err
+		}
+		defer file.Close()
+
+		sc := bufio.NewScanner(file)
+		sc.Scan()
+		line := sc.Text()
+		_, err = fmt.Sscanf(line, "%d-%d", &first, &second)
+		if err != nil {
+			return err
+		}
+		c.cpus = iterateAndAppend(first, second, c.cpus)
 	}
 
-	file, err := os.Open(CpuPath)
-	if err != nil {
-		return err
-	}
-	defer file.Close()
-
-	sc := bufio.NewScanner(file)
-	sc.Scan()
-	line := sc.Text()
-	_, err = fmt.Sscanf(line, "%d-%d", &first, &last)
-	if err != nil {
-		return err
-	}
-	for i := first; i <= last; i++ {
-		c.cpus = append(c.cpus, i)
+	// discard cpu 0
+	if c.cpus[0] == 0 && !*UseCpu0 {
+		c.cpus = c.cpus[1:]
 	}
 	return nil
 }
 
 func CpuAllocator() (*CpuAllocatorT, error) {
 	if cpuAllocator == nil {
+		var err error
 		cpuAllocator = new(CpuAllocatorT)
-		err := cpuAllocator.readCpus()
+		cpuAllocator.maxContainerCount = 4
+		buildNumberStr := os.Getenv("BUILD_NUMBER")
+
+		if buildNumberStr != "" {
+			cpuAllocator.runningInCi = true
+			// get last digit of build number
+			cpuAllocator.buildNumber, err = strconv.Atoi(buildNumberStr[len(buildNumberStr)-1:])
+			if err != nil {
+				return nil, err
+			}
+		}
+		err = cpuAllocator.readCpus()
 		if err != nil {
 			return nil, err
 		}
commit	5d171ebdc21efa4085e2c2130f595d7e0e1d2f59	[log] [tgz]
author	Adrian Villin <avillin@cisco.com>	Mon Jun 17 08:51:27 2024 +0200
committer	Dave Wallace <dwallacelf@gmail.com>	Mon Jul 08 16:27:38 2024 +0000
tree	9b1e967d20e8c83f9780d5a76579db0edd3c5373
parent	75e8e1e948da182dbf4f6b3394f1b7fc44c1403a [diff] [blame]