topology: DetectPackages now calls SetNodes itself (Linux, Windows) or sets
LP.node inline (Apple), and the Topology constructor no longer calls SetNodes.
On Linux, if every LP ends up in cluster 0, clusters are derived from NUMA
nodes when there is more than one node per package (sub-NUMA clusters).

Review fix (Linux DetectPackages): `cluster_bits` is now filled for every LP
in the main detection loop. Before, it was only filled when all LPs were in
cluster 0, so a system where sysfs reported several clusters left it empty and
tripped HWY_ASSERT(pp.cluster_bits.Count() != 0). Hunk 2 therefore has 80
added lines and the new-file offsets of all later hunks are shifted by +2.

NOTE(review): template arguments (after `std::vector`, `static_cast`) are
absent in this copy of the patch, presumably stripped during extraction. They
are left untouched; restore them from the original commit before applying.

diff --git a/hwy/contrib/thread_pool/topology.cc b/hwy/contrib/thread_pool/topology.cc
index d524120344..06354911de 100644
--- a/hwy/contrib/thread_pool/topology.cc
+++ b/hwy/contrib/thread_pool/topology.cc
@@ -507,54 +507,13 @@ class Remapper {
 // For internal use by `DetectPackages`.
 struct PerPackage {
   Remapper clusters;
+  // We might assign sub-NUMA clusters, hence use this to count them.
+  LogicalProcessorSet cluster_bits;
   Remapper cores;
   // We rely on this zero-init and increment it below.
   uint8_t smt_per_core[kMaxLogicalProcessors] = {0};
 };
 
-// Initializes `lps` and returns a PackageSizes vector (empty on failure)
-// indicating the number of clusters and cores per package.
-std::vector DetectPackages(std::vector& lps) {
-  std::vector empty;
-
-  Remapper packages;
-  for (size_t lp = 0; lp < lps.size(); ++lp) {
-    if (!packages(kPackage, lp, &lps[lp].package)) {
-      HWY_WARN("Failed to read sysfs package for LP %zu\n", lp);
-      return empty;
-    }
-  }
-  std::vector per_package(packages.Num());
-  HWY_ASSERT(!per_package.empty());
-
-  for (size_t lp = 0; lp < lps.size(); ++lp) {
-    PerPackage& pp = per_package[lps[lp].package];
-    // Not a failure: some CPUs lack a (shared) L3 cache.
-    if (!pp.clusters(kCluster, lp, &lps[lp].cluster)) {
-      lps[lp].cluster = 0;
-    }
-
-    if (!pp.cores(kCore, lp, &lps[lp].core)) {
-      HWY_WARN("Failed to read sysfs core for LP %zu\n", lp);
-      return empty;
-    }
-
-    // SMT ID is how many LP we have already seen assigned to the same core.
-    HWY_ASSERT(lps[lp].core < kMaxLogicalProcessors);
-    lps[lp].smt = pp.smt_per_core[lps[lp].core]++;
-    HWY_ASSERT(lps[lp].smt < 16);
-  }
-
-  std::vector package_sizes(per_package.size());
-  for (size_t p = 0; p < package_sizes.size(); ++p) {
-    // Was zero if the package has no shared L3, see above.
-    package_sizes[p].num_clusters = HWY_MAX(1, per_package[p].clusters.Num());
-    package_sizes[p].num_cores = per_package[p].cores.Num();
-    HWY_ASSERT(package_sizes[p].num_cores != 0);
-  }
-  return package_sizes;
-}
-
 std::vector ExpandList(const char* list, size_t list_end,
                        size_t max_lp) {
   std::vector expanded;
@@ -639,6 +598,86 @@ void SetNodes(std::vector& lps) {
   }
 }
 
+// Initializes `lps` and returns a PackageSizes vector (empty on failure)
+// indicating the number of clusters and cores per package.
+std::vector DetectPackages(std::vector& lps) {
+  std::vector empty;
+
+  bool all_cluster_0 = true;
+
+  Remapper packages;
+  for (size_t lp = 0; lp < lps.size(); ++lp) {
+    if (!packages(kPackage, lp, &lps[lp].package)) {
+      HWY_WARN("Failed to read sysfs package for LP %zu\n", lp);
+      return empty;
+    }
+  }
+  std::vector per_package(packages.Num());
+  HWY_ASSERT(!per_package.empty());
+
+  SetNodes(lps);
+
+  for (size_t lp = 0; lp < lps.size(); ++lp) {
+    PerPackage& pp = per_package[lps[lp].package];
+    if (!pp.clusters(kCluster, lp, &lps[lp].cluster)) {
+      // Not a failure: some CPUs lack a (shared) L3 cache.
+      lps[lp].cluster = 0;
+    }
+    all_cluster_0 &= lps[lp].cluster == 0;
+    // Record the cluster even when sysfs reported clusters; otherwise
+    // `cluster_bits` stays empty and the `Count() != 0` assert below fires.
+    pp.cluster_bits.Set(lps[lp].cluster);
+
+    if (!pp.cores(kCore, lp, &lps[lp].core)) {
+      HWY_WARN("Failed to read sysfs core for LP %zu\n", lp);
+      return empty;
+    }
+
+    // SMT ID is how many LP we have already seen assigned to the same core.
+    HWY_ASSERT(lps[lp].core < kMaxLogicalProcessors);
+    lps[lp].smt = pp.smt_per_core[lps[lp].core]++;
+    HWY_ASSERT(lps[lp].smt < 16);
+  }
+
+  // Looks like all LPs are in the same cluster: sysfs did not report any
+  // shared L3 cache. Check for sub-NUMA clusters:
+  if (all_cluster_0) {
+    LogicalProcessorSet nodes;
+    for (const Topology::LP& lp : lps) {
+      nodes.Set(lp.node);
+    }
+    const size_t num_nodes = nodes.Count();
+
+    const size_t nodes_per_package = num_nodes / per_package.size();
+    if (HWY_UNLIKELY(nodes_per_package == 0)) {
+      HWY_WARN("Why fewer NUMA nodes (%zu) than packages (%zu)?", num_nodes,
+               per_package.size());
+    }
+    if (nodes_per_package > 1) {
+      for (PerPackage& pp : per_package) {
+        pp.cluster_bits = LogicalProcessorSet();
+      }
+      // Sub-NUMA clusters: assign cluster from per-package nodes.
+      for (Topology::LP& lp : lps) {
+        HWY_ASSERT(lp.package == lp.node / nodes_per_package);
+        lp.cluster = lp.node % nodes_per_package;
+        per_package[lp.package].cluster_bits.Set(lp.cluster);
+      }
+    }  // else: for traditional NUMA (node == package), cluster remains 0.
+  }
+
+  std::vector package_sizes(per_package.size());
+  for (size_t pkg_idx = 0; pkg_idx < per_package.size(); ++pkg_idx) {
+    const PerPackage& pp = per_package[pkg_idx];
+    HWY_ASSERT(pp.cluster_bits.Count() != 0);
+    package_sizes[pkg_idx].num_clusters = pp.cluster_bits.Count();
+    package_sizes[pkg_idx].num_cores = pp.cores.Num();
+    HWY_ASSERT(package_sizes[pkg_idx].num_cores != 0);
+  }
+
+  return package_sizes;
+}
+
 void SetClusterCacheSizes(std::vector& packages) {
   for (size_t ip = 0; ip < packages.size(); ++ip) {
     Topology::Package& p = packages[ip];
@@ -725,6 +764,30 @@ size_t MaxCoresPerCluster(const size_t max_lps_per_core,
   return max_cores_per_cluster;
 }
 
+// Sets LP.node for all `lps`.
+void SetNodes(std::vector& lps) {
+  // Zero-initialize all nodes in case the below fails.
+  for (size_t lp = 0; lp < lps.size(); ++lp) {
+    lps[lp].node = 0;
+  }
+
+  // We want the full NUMA nodes, but Windows Server 2022 truncates the results
+  // of `RelationNumaNode` to a single 64-LP group. To get the old, unlimited
+  // behavior without using the new `RelationNumaNodeEx` symbol, use the old
+  // `RelationAll` and filter the SLPI we want.
+  (void)ForEachSLPI(RelationAll, [&](const SLPI& info) {
+    if (info.Relationship != RelationNumaNode) return;
+    const NUMA_NODE_RELATIONSHIP& nn = info.NumaNode;
+    // This field was previously reserved/zero. There is at least one group.
+    const size_t num_groups = HWY_MAX(1, nn.GroupCount);
+    const uint8_t node = static_cast(nn.NodeNumber);
+    ForeachBit(num_groups, nn.GroupMasks, lps, __LINE__,
+               [node](size_t lp, std::vector& lps) {
+                 lps[lp].node = node;
+               });
+  });
+}
+
 // Initializes `lps` and returns a `PackageSizes` vector (empty on failure)
 // indicating the number of clusters and cores per package.
 std::vector DetectPackages(std::vector& lps) {
@@ -732,6 +795,8 @@ std::vector DetectPackages(std::vector& lps) {
   const size_t max_cores_per_cluster =
       MaxCoresPerCluster(max_lps_per_core, lps);
 
+  SetNodes(lps);
+
   std::vector packages;
   size_t package_idx = 0;
   (void)ForEachSLPI(RelationProcessorPackage, [&](const SLPI& info) {
@@ -753,30 +818,6 @@ std::vector DetectPackages(std::vector& lps) {
   return packages;
 }
 
-// Sets LP.node for all `lps`.
-void SetNodes(std::vector& lps) {
-  // Zero-initialize all nodes in case the below fails.
-  for (size_t lp = 0; lp < lps.size(); ++lp) {
-    lps[lp].node = 0;
-  }
-
-  // We want the full NUMA nodes, but Windows Server 2022 truncates the results
-  // of `RelationNumaNode` to a single 64-LP group. To get the old, unlimited
-  // behavior without using the new `RelationNumaNodeEx` symbol, use the old
-  // `RelationAll` and filter the SLPI we want.
-  (void)ForEachSLPI(RelationAll, [&](const SLPI& info) {
-    if (info.Relationship != RelationNumaNode) return;
-    const NUMA_NODE_RELATIONSHIP& nn = info.NumaNode;
-    // This field was previously reserved/zero. There is at least one group.
-    const size_t num_groups = HWY_MAX(1, nn.GroupCount);
-    const uint8_t node = static_cast(nn.NodeNumber);
-    ForeachBit(num_groups, nn.GroupMasks, lps, __LINE__,
-               [node](size_t lp, std::vector& lps) {
-                 lps[lp].node = node;
-               });
-  });
-}
-
 #elif HWY_OS_APPLE
 
 // Initializes `lps` and returns a `PackageSizes` vector (empty on failure)
@@ -812,6 +853,7 @@ std::vector DetectPackages(std::vector& lps) {
     lps[lp].core = static_cast(lp / lp_per_core);
     lps[lp].smt = static_cast(lp % lp_per_core);
     lps[lp].cluster = static_cast(lps[lp].core / cores_per_cluster);
+    lps[lp].node = 0;  // no NUMA
   }
 
   PackageSizes ps;
@@ -820,13 +862,6 @@ std::vector DetectPackages(std::vector& lps) {
   return std::vector{ps};
 }
 
-// Sets LP.node for all `lps`.
-void SetNodes(std::vector& lps) {
-  for (size_t lp = 0; lp < lps.size(); ++lp) {
-    lps[lp].node = 0;  // no NUMA
-  }
-}
-
 #endif  // HWY_OS_*
 
 #if HWY_OS_WIN || HWY_OS_APPLE
@@ -857,7 +892,6 @@ HWY_CONTRIB_DLLEXPORT Topology::Topology() {
   lps.resize(TotalLogicalProcessors());
   const std::vector& package_sizes = DetectPackages(lps);
   if (package_sizes.empty()) return;
-  SetNodes(lps);
 
   // Allocate per-package/cluster/core vectors. This indicates to callers that
   // detection succeeded.