// Patch e-mail (git format-patch style) for commit 41e5fce; per its diffstat it touches only
// app/tools/graphstats.cpp. Records marked "+" are added, "-" are removed, the rest is context.
// NOTE(review): line breaks/alignment look collapsed and template argument lists look stripped
// (bare `std::vector`, `std::numeric_limits::max()`) -- presumably an extraction artifact;
// confirm against the repository before applying or compiling anything from this text.
//
// What the added ("+") side does, as far as this text shows:
//  - Configuration: holds the CLI settings (input files and format, num_chunks, header_only,
//    omit_header, strip_extension, and the list of degrees to count).
//  - Statistics: the numeric fields get default initializers; num_deg_nodes is added.
//  - PrintHeader / PrintRow: emit one "NumDeg<d>Nodes" column / one value per requested
//    degree, placed between the M and MinDeg columns.
//  - StatisticsComputator: operator() adds chunk.edges.size() to m and increments
//    degrees_[from] for every edge, growing degrees_ on demand; Finalize() sets
//    n = degrees_.size() and derives min/avg/max degree plus the per-degree node counts.
//  - ComputeStatistics: reads chunks 0..num_chunks-1 through ReadGraph and feeds each one to
//    the computator; only when num_chunks == 1 does it call FinalizeReadGraph beforehand.
//  - parse_cli_arguments: builds the CLI; the long name of -H changes from --no-header to
//    --omit-header, and --count-degree is new.
//  - main: prints an error and returns early unless GetCommSize(MPI_COMM_WORLD) is 1.
//
// NOTE(review), visible in the added code:
//  - If no edge is seen, degrees_ stays empty, so n == 0: min_deg keeps the numeric_limits
//    max() sentinel and avg_deg is computed as m / 0.
//  - n only covers vertex IDs up to the largest edge source (`from`) encountered.
//  - Finalize(const Graph&) never reads its argument; both overloads do the same work.
//  - PrintHeader keeps `((void)config);` although config is used in the loop below it.
//  - The "must be run with just one MPI process" path returns MPI_Finalize()'s result as the
//    exit code -- presumably a success code; confirm whether a non-zero exit is wanted.
From 41e5fce706a5881d820f9ca9d7e1a364bcc9b13d Mon Sep 17 00:00:00 2001 From: Daniel Seemaier Date: Thu, 12 Oct 2023 10:26:13 +0200 Subject: [PATCH] Drop MPI support for graphstats, restructure it, add functionality to count certain degrees --- app/tools/graphstats.cpp | 219 +++++++++++++++++++++++---------------- 1 file changed, 132 insertions(+), 87 deletions(-) diff --git a/app/tools/graphstats.cpp b/app/tools/graphstats.cpp index 9c8d286..922b179 100644 --- a/app/tools/graphstats.cpp +++ b/app/tools/graphstats.cpp @@ -14,31 +14,39 @@ using namespace kagen; +struct Configuration { + std::vector input_filenames; + FileFormat input_format = FileFormat::EXTENSION; + + int num_chunks = 1; + bool header_only = false; + bool omit_header = false; + bool strip_extension = false; + + std::vector count_num_deg_nodes; +}; + struct Statistics { std::string name; - SInt n; - SInt m; - SInt min_deg; - LPFloat avg_deg; - SInt max_deg; -}; -Graph LoadGraph(const PGeneratorConfig& config) { - const PEID rank = GetCommRank(MPI_COMM_WORLD); - const PEID size = GetCommSize(MPI_COMM_WORLD); + SInt n = 0; + SInt m = 0; + SInt min_deg = 0; + LPFloat avg_deg = 0.0; + SInt max_deg = 0; - FileGraphFactory factory; - const auto normalized_config = factory.NormalizeParameters(config, rank, size, false); - auto loader = factory.Create(normalized_config, rank, size); - loader->Generate(GraphRepresentation::EDGE_LIST); - loader->Finalize(MPI_COMM_WORLD); - return loader->Take(); -} + std::vector num_deg_nodes; +}; + +void PrintHeader(const Configuration& config) { + ((void)config); -void PrintHeader() { std::cout << "Graph,"; std::cout << "N,"; std::cout << "M,"; + for (const SInt deg: config.count_num_deg_nodes) { + std::cout << "NumDeg" << deg << "Nodes,"; + } std::cout << "MinDeg,"; std::cout << "AvgDeg,"; std::cout << "MaxDeg"; @@ -49,119 +57,156 @@ void PrintRow(const Statistics& stats) { std::cout << stats.name << ","; std::cout << stats.n << ","; std::cout << stats.m << ","; + for 
(const SInt num_deg_nodes: stats.num_deg_nodes) { + std::cout << num_deg_nodes << ","; + } std::cout << stats.min_deg << ","; std::cout << stats.avg_deg << ","; std::cout << stats.max_deg; std::cout << std::endl; } -Statistics GenerateInternal(const PGeneratorConfig& config) { - Graph graph = LoadGraph(config); +struct StatisticsComputator { + StatisticsComputator(const Configuration& config) : config_(config) {} - Statistics stats; - stats.n = FindNumberOfGlobalNodes(graph.vertex_range, MPI_COMM_WORLD); - stats.m = FindNumberOfGlobalEdges(graph.edges, MPI_COMM_WORLD); + void operator()(const Graph& chunk) { + stats_.m += chunk.edges.size(); - const auto degree_stats = ReduceDegreeStatistics(graph.edges, stats.n, MPI_COMM_WORLD); - stats.min_deg = degree_stats.min; - stats.avg_deg = degree_stats.mean; - stats.max_deg = degree_stats.max; + for (const auto& [from, to]: chunk.edges) { + while (degrees_.size() <= from) { + degrees_.push_back(0); + } + ++degrees_[from]; + } + } - return stats; -} + Statistics Finalize(const Graph& graph) { + FinalizeStreamingStatistics(); + return std::move(stats_); + } -Statistics GenerateExternal(const PGeneratorConfig& config, const int num_chunks) { - if (GetCommSize(MPI_COMM_WORLD) > 1) { - std::cerr << "Error: external statistics generation is only supported for a single MPI process\n"; - std::exit(1); + Statistics Finalize() { + FinalizeStreamingStatistics(); + return std::move(stats_); } - Statistics stats; +private: + void FinalizeStreamingStatistics() { + stats_.n = degrees_.size(); + FinalizeDegreeStatistics(); + } - const auto reader = CreateGraphReader(config.input_graph.format, config.input_graph, 0, 1); - auto reported_size = reader->ReadSize(); + void FinalizeDegreeStatistics() { + stats_.min_deg = std::numeric_limits::max(); + stats_.max_deg = std::numeric_limits::min(); - std::vector degrees; + stats_.num_deg_nodes.resize(config_.count_num_deg_nodes.size()); + std::fill(stats_.num_deg_nodes.begin(), 
stats_.num_deg_nodes.end(), 0); - for (int chunk = 0; chunk < num_chunks; ++chunk) { - const auto [from, to] = ComputeRange(reported_size.first, num_chunks, chunk); - Graph graph = reader->Read(from, to, std::numeric_limits::max(), GraphRepresentation::EDGE_LIST); + for (SInt node = 0; node < stats_.n; ++node) { + const SInt deg = degrees_[node]; + stats_.min_deg = std::min(stats_.min_deg, deg); + stats_.max_deg = std::max(stats_.max_deg, deg); - for (const auto& [from, to]: graph.edges) { - while (degrees.size() <= from) { - degrees.push_back(0); + for (std::size_t i = 0; i < config_.count_num_deg_nodes.size(); ++i) { + stats_.num_deg_nodes[i] += (deg == config_.count_num_deg_nodes[i]); } - ++degrees[from]; } - stats.m += graph.edges.size(); + stats_.avg_deg = 1.0 * stats_.m / stats_.n; } - const auto [min_it, max_it] = std::minmax_element(degrees.begin(), degrees.end()); + const Configuration& config_; - stats.n = degrees.size(); - stats.min_deg = *min_it; - stats.avg_deg = 1.0 * stats.m / stats.n; - stats.max_deg = *max_it; + std::vector degrees_; - return stats; -} + Statistics stats_; +}; -int main(int argc, char* argv[]) { - MPI_Init(&argc, &argv); +Statistics ComputeStatistics(const Configuration& stats_config, const PGeneratorConfig& kagen_config) { + StatisticsComputator computator(stats_config); - std::vector input_filenames; - bool do_strip_extension = false; - bool do_no_header = false; - bool do_header_only = false; - int num_chunks = 1; - PGeneratorConfig config; + auto reader = CreateGraphReader(kagen_config.input_graph.format, kagen_config.input_graph, 0, 1); + + Graph graph = + ReadGraph(*reader, GraphRepresentation::EDGE_LIST, kagen_config.input_graph, 0, stats_config.num_chunks); + computator(graph); - CLI::App app("graphstats: compute some basic statistics on a graph"); + for (int chunk = 1; chunk < stats_config.num_chunks; ++chunk) { + const Graph next_graph = ReadGraph( + *reader, GraphRepresentation::EDGE_LIST, kagen_config.input_graph, 
chunk, stats_config.num_chunks); + computator(next_graph); + } + + if (stats_config.num_chunks == 1) { + graph = FinalizeReadGraph(reader->Deficits(), std::move(graph), false, MPI_COMM_WORLD); + return computator.Finalize(graph); + } else { + return computator.Finalize(); + } +} + +Configuration parse_cli_arguments(int argc, char* argv[]) { + Configuration config; + + CLI::App app("graphstats: compute basic graph statistics"); CLI::Option_group* group = app.add_option_group("Options"); group->require_option(1); - group->add_option("input filenames", input_filenames)->check(CLI::ExistingFile); - group->add_flag("--header-only", do_header_only); + group->add_option("input filenames", config.input_filenames)->check(CLI::ExistingFile); + group->add_flag("--header-only", config.header_only); - app.add_option("-f,--format", config.input_graph.format, "File format of the input file(s).") - ->transform(CLI::CheckedTransformer(GetInputFormatMap())); - app.add_flag( - "--strip-extension", do_strip_extension, - "If set, print the filename in the Graph column without file extension."); - app.add_flag("-H,--no-header", do_no_header, "If set, do not print the CSV header line."); app.add_option( - "-C,--num-chunks", num_chunks, + "-C,--num-chunks", config.num_chunks, "If set, compute the statistics externally by splitting the graph into this many chunks; some statistics might " "not be available in this mode. 
Still requires O(n) memory."); - CLI11_PARSE(app, argc, argv); + app.add_option("-f,--format", config.input_format, "File format of the input file(s).") + ->transform(CLI::CheckedTransformer(GetInputFormatMap())); + app.add_flag( + "--strip-extension", config.strip_extension, + "If set, print the filename in the Graph column without file extension."); + app.add_flag("-H,--omit-header", config.omit_header, "If set, do not print the CSV header line."); + + app.add_option("--count-degree", config.count_num_deg_nodes, "Count the number of nodes with this degree."); - // Catch special case: only print CSV header line - if ((do_header_only || !do_no_header) && GetCommRank(MPI_COMM_WORLD) == ROOT) { - PrintHeader(); + try { + app.parse(argc, argv); + } catch (const CLI::ParseError& e) { + (app).exit(e); + std::exit(1); } - if (do_header_only) { + + return config; +} + +int main(int argc, char* argv[]) { + MPI_Init(&argc, &argv); + if (GetCommSize(MPI_COMM_WORLD) != 1) { + std::cerr << "must be run with just one MPI process\n"; return MPI_Finalize(); } - for (const auto& filename: input_filenames) { - config.input_graph.filename = filename; + Configuration config = parse_cli_arguments(argc, argv); - Statistics stats; - if (num_chunks == 1) { - stats = GenerateInternal(config); - } else { - stats = GenerateExternal(config, num_chunks); - } + if (config.header_only || !config.omit_header) { + PrintHeader(config); + } + if (config.header_only) { + return MPI_Finalize(); + } + + for (const auto& filename: config.input_filenames) { + PGeneratorConfig kagen_config; + kagen_config.input_graph.filename = filename; + kagen_config.input_graph.format = config.input_format; - stats.name = ExtractFilename(config.input_graph.filename); - if (do_strip_extension) { + Statistics stats = ComputeStatistics(config, kagen_config); + stats.name = ExtractFilename(filename); + if (config.strip_extension) { stats.name = StripExtension(stats.name); } - if (GetCommRank(MPI_COMM_WORLD) == ROOT) { - 
PrintRow(stats); - } + PrintRow(stats); } return MPI_Finalize();