From c7c6c6aa0cf344aad0ce8802b23750ebbca01ac8 Mon Sep 17 00:00:00 2001 From: Brice Goglin Date: Wed, 24 Feb 2016 16:15:27 -0600 Subject: [PATCH 1/2] core: optimize the insert by breaking out instead of looking at useless remaining siblings When we have found where we should be inserted, we have to look at the next siblings in case they should become children. No need to go till the last child: once the first cpuset bit of a child is above our last, we're done. Mostly useful when inserting in reverse-cpuset-order. --- hwloc/topology.c | 46 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 4 deletions(-) diff --git a/hwloc/topology.c b/hwloc/topology.c index 066c06dd48..2efcd982f9 100644 --- a/hwloc/topology.c +++ b/hwloc/topology.c @@ -979,6 +979,38 @@ hwloc__object_cpusets_compare_first(hwloc_obj_t obj1, hwloc_obj_t obj2) return hwloc_bitmap_compare_first(obj1->cpuset, obj2->cpuset); } +/* Compare objects that are already known as DIFFERENT. + * See how they are actually placed, more precisely than hwloc__object_cpusets_compare_first(). + */ +static int +hwloc__object_cpusets_compare_different(hwloc_obj_t obj1, hwloc_obj_t obj2) +{ + hwloc_bitmap_t set1, set2; + unsigned first1, last1, first2, last2; + if (obj1->complete_cpuset && obj2->complete_cpuset) { + set1 = obj1->complete_cpuset; + set2 = obj2->complete_cpuset; + } else { + set1 = obj1->cpuset; + set2 = obj2->cpuset; + } + first1 = hwloc_bitmap_first(set1); + last1 = hwloc_bitmap_last(set1); + first2 = hwloc_bitmap_first(set2); + last2 = hwloc_bitmap_last(set2); + + if (last1 < first2) + /* 1 is entirely before 2 */ + return -2; + if (first1 > last2) + /* 2 is entirely before 1 */ + return 2; + /* there's some overlap (without intersection since DIFFERENT). + * just return the comparison of first1 and first2 for basic ordering. + */ + return first1 < first2 ? -1 : first1 > first2 ? 
1 : 0; +} + /* format the obj info to print in error messages */ static void hwloc__report_error_format_obj(char *buf, size_t buflen, hwloc_obj_t obj) @@ -1185,14 +1217,19 @@ hwloc___insert_object_by_cpuset(struct hwloc_topology *topology, hwloc_obj_t cur } goto putback; - case HWLOC_OBJ_DIFFERENT: - /* OBJ should be a child of CUR before CHILD, mark its position if not found yet. */ - if (!putp && hwloc__object_cpusets_compare_first(obj, child) < 0) - /* Don't insert yet, there could be intersect errors later */ + case HWLOC_OBJ_DIFFERENT: { + /* OBJ should be a child of CUR before CHILD */ + int comp = hwloc__object_cpusets_compare_different(obj, child); + /* mark its position if not found yet. */ + if (!putp && comp < 0) putp = cur_children; + if (comp == -2) + /* Only insert if we're sure we won't intersect any other child */ + goto insert; /* Advance cur_children. */ cur_children = &child->next_sibling; break; + } case HWLOC_OBJ_CONTAINS: /* OBJ contains CHILD, remove CHILD from CUR */ @@ -1212,6 +1249,7 @@ hwloc___insert_object_by_cpuset(struct hwloc_topology *topology, hwloc_obj_t cur /* Put OBJ where it belongs, or in last in CUR's children. */ if (!putp) putp = cur_children; + insert: obj->next_sibling = *putp; *putp = obj; obj->parent = cur; From a35395d638c73eda3c80b05d75aa4aa64b1b4fcb Mon Sep 17 00:00:00 2001 From: Brice Goglin Date: Wed, 24 Feb 2016 16:18:37 -0600 Subject: [PATCH 2/2] synthetic: insert object in reverse cpuset order to greatly optimize creation time The core inserts new objects within children by walking the existing children list from left to right. So inserting the left children last is more efficient since it doesn't walk the entire list anymore. Otherwise, the overall complexity is quadratic with the number of children below objects. lstopo -i "pu:10000" goes down from 20s to 0.8s on my laptop. lstopo -i "100 100" is only improved by 1-2%. On real/existing platforms (KNL or 768pu-AltixUV), the impact seems negligible. 
--- hwloc/topology-synthetic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hwloc/topology-synthetic.c b/hwloc/topology-synthetic.c index de370df11a..3192720aa7 100644 --- a/hwloc/topology-synthetic.c +++ b/hwloc/topology-synthetic.c @@ -808,7 +808,7 @@ hwloc__look_synthetic(struct hwloc_topology *topology, break; } - os_index = curlevel->next_os_index++; + os_index = curlevel->totalwidth - (++curlevel->next_os_index); if (curlevel->index_array) os_index = curlevel->index_array[os_index]; else if (hwloc_obj_type_is_cache(type) || type == HWLOC_OBJ_GROUP)