diff --git a/libs_required/ACG/Utils/Histogram.cc b/libs_required/ACG/Utils/Histogram.cc index 500badb5936a09f71b4e8f2f0f2ff6a67035e864..69a56baf7959635acaec90d4c2e444f7b56110ac 100644 --- a/libs_required/ACG/Utils/Histogram.cc +++ b/libs_required/ACG/Utils/Histogram.cc @@ -39,14 +39,4 @@ * * \*===========================================================================*/ -#include "Histogram.hh" - -namespace ACG { - -template<> -QString HistogramT<double>::getBoundaryLabel(size_t idx) const { - // TODO: choose accuracy based on avg_bin_size_ - return QString::number(bin_boundaries_[idx], 'g', 2); -} - -} // namespace ACG +//#include "Histogram.hh" diff --git a/libs_required/ACG/Utils/Histogram.hh b/libs_required/ACG/Utils/Histogram.hh index aeb545330b4f0cd8f9a64107fcb797afd178c247..90bd93eb03037a9e3c61e4515a259a80ac2e3441 100644 --- a/libs_required/ACG/Utils/Histogram.hh +++ b/libs_required/ACG/Utils/Histogram.hh @@ -39,8 +39,7 @@ * * \*===========================================================================*/ -#ifndef ACG_HISTOGRAM_HH -#define ACG_HISTOGRAM_HH +#pragma once #include <vector> #include <cassert> @@ -48,9 +47,11 @@ #include <exception> #include <algorithm> #include <type_traits> +#include <map> #include <QString> +#include "SmartPointer.hh" #include "../Config/ACGDefines.hh" namespace ACG { @@ -61,6 +62,13 @@ public: PerBoundary, }; + Histogram() = default; + Histogram(std::vector<size_t> &&bins, + std::vector<double> &&bin_widths) + : bins_(std::move(bins)), + bin_widths_(std::move(bin_widths)) + {} + virtual ~Histogram() = default; const std::vector<size_t> &getBins() const { return bins_; } const std::vector<double> &getBinWidths() const { return bin_widths_; } @@ -76,83 +84,50 @@ protected: }; -// we need to be careful with ranges, some sums (e.g. INT_MAX - INT_MIN) do not fit into a signed int, -// so we store bin sizes as doubles. With specialization or some tricks we -// could probably use the next-biggest integer type, but if we're using -// the biggest integer type already, we should to fall back to double anyways. +inline QString formatValue (int val) { + return QString::number(val); +} +inline QString formatValue (unsigned int val) { + return QString::number(val); +} +inline QString formatValue (double val) { + return QString::number(val, 'g', 3); +} + +template<typename T> +class UnbinnedHistogram : public Histogram +{ +public: + UnbinnedHistogram(std::vector<size_t> &&bin_counts, + std::vector<T> &&bin_values) + : Histogram(std::move(bin_counts), + std::vector<double>(bin_counts.size(), 1.)), + bin_values_(std::move(bin_values)) + { + } + double getTotalWidth() const override { return bins_.size();}; + LabelType getLabelType() const override { return LabelType::PerBin; }; + QString getBinLabel (size_t idx) const override { return formatValue(bin_values_[idx]);} +private: + std::vector<T> bin_values_; +}; template<typename T> class HistogramT : public Histogram { public: - HistogramT(const std::vector<int> &histogram, - const std::vector<T> &bin_boundaries, - const std::vector<double> &bin_widths) + HistogramT() = default; + + HistogramT(std::vector<size_t> &&histogram, + std::vector<T> &&bin_boundaries, + std::vector<double> &&bin_widths + ) + : Histogram(std::move(histogram), std::move(bin_widths)), + bin_boundaries_(std::move(bin_boundaries)) { if (bins_.size() != bin_widths_.size() || bins_.size() + 1 != bin_boundaries_.size()) { throw std::runtime_error("Histogram constructor sizes don't match."); } - bins_ = histogram; - bin_boundaries_ = bin_boundaries; - bin_widths_ = bin_widths; - double range = bin_boundaries.back() - bin_boundaries.front(); - avg_bin_size_ = range / bins_.size(); - } - - template<typename IterT> - HistogramT(IterT begin, IterT end, size_t max_bins) - { - static_assert(std::is_assignable<T&, typename IterT::value_type>::value, "IterT incompatible with T."); - static_assert(std::is_floating_point<typename IterT::value_type>::value, "HistogramT currently only supports floating point values."); - assert(max_bins > 0); - const size_t n = std::distance(begin, end); - if (n == 0) return; - - const auto minmax = std::minmax_element(begin, end); - const T min = *minmax.first; - const T max = *minmax.second; - const double min_dbl = static_cast<double>(min); - const double range = static_cast<double>(max) - min_dbl; - - const size_t n_bins_max = std::min(max_bins, n); - bin_boundaries_.reserve(n_bins_max + 1); - - T last_boundary = min; - bin_boundaries_.push_back(min); - for (size_t i = 1; i < n_bins_max; ++i) { - // Adding range/n_bins to a accumulator might seem more efficient/elegant, - // but might cause numeric issues. - - // This multiplication order is bad for huge ranges that cause overflows, - // however I assume tiny ranges are more common than huge values and more - // important to get right. If you disagree, add a case distinction or something better. - - T boundary = static_cast<T>(min + (i * range) / n_bins_max); - // avoid zero-sized bins (happens for many ints with values in a small range) - if (boundary != last_boundary || i == 0) { - bin_boundaries_.push_back(boundary); - bin_widths_.push_back(boundary - last_boundary); - } - last_boundary = boundary; - } - bin_boundaries_.push_back(max); // avoid rounding issues etc by explicitly picking max. - bin_widths_.push_back(max - last_boundary); - - bin_boundaries_.shrink_to_fit(); - size_t n_bins = bin_boundaries_.size() - 1; - bins_.resize(n_bins); - - // note that due to rounding, our bins may have differing sizes, which matters - // if we handle integral types (relative size difference worst case: bin width 1 vs 2). - // Be careful to select the right bin. - std::for_each(begin, end, [&](const T &val) { - auto it = std::upper_bound(bin_boundaries_.begin(), bin_boundaries_.end(), val); - if (it == bin_boundaries_.end()) --it; // the last value is exactly max! - size_t idx = std::distance(bin_boundaries_.begin(), it); - assert(idx > 0); - ++bins_[idx - 1]; - }); - avg_bin_size_ = range / n_bins; } const std::vector<T> &getBinBoundaries() const { @@ -169,20 +144,125 @@ public: return LabelType::PerBoundary; } - QString getBoundaryLabel (size_t idx) const override; + QString getBoundaryLabel (size_t idx) const override + { + // TODO: for floating point types, choose accuracy depending on bin size + return formatValue(bin_boundaries_[idx]); + }; private: std::vector<T> bin_boundaries_; - double avg_bin_size_ = 0.0; }; template<typename T> -QString HistogramT<T>::getBoundaryLabel(size_t idx) const { - return QString::number(bin_boundaries_[idx]); +std::unique_ptr<Histogram> create_histogram_unbinned(const std::map<T, size_t> &counts) +{ + std::vector<T> values; + std::vector<size_t> histogram; + values.reserve(counts.size()); + histogram.reserve(counts.size()); + for (const auto &entry: counts) + { + values.push_back(entry.first); + histogram.push_back(entry.second); + } + return ptr::make_unique<UnbinnedHistogram<T>>(std::move(histogram), std::move(values)); +} + +template<typename T, typename Iterable> +std::unique_ptr<Histogram> create_histogram_autorange(const Iterable &range, size_t max_bins = 50) +{ +// we need to be careful with ranges, some sums (e.g. INT_MAX - INT_MIN) do not fit into a signed int, +// so we store bin sizes as doubles. With specialization or some tricks we +// could probably use the next-biggest integer type, but if we're using +// the biggest integer type already, we should to fall back to double anyways. + + + std::vector<T> bin_boundaries; + std::vector<size_t> bins; + std::vector<double> bin_widths; + + const size_t n = std::distance(begin(range), end(range)); + if (n == 0) return {}; + const auto minmax = std::minmax_element(begin(range), end(range)); + const T min = *minmax.first; + const T max = *minmax.second; + + const double min_dbl = static_cast<double>(min); + const double val_range = static_cast<double>(max) - min_dbl; + + const size_t n_bins_max = std::min(max_bins, n); + bin_boundaries.reserve(n_bins_max + 1); + + T last_boundary = min; + bin_boundaries.push_back(min); + for (size_t i = 1; i < n_bins_max; ++i) { + // Adding val_range/n_bins to a accumulator might seem more efficient/elegant, + // but might cause numeric issues. + + // This multiplication order is bad for huge ranges that cause overflows, + // however I assume tiny ranges are more common than huge values and more + // important to get right. If you disagree, add a case distinction or something better. + + T boundary = static_cast<T>(min + (i * val_range) / n_bins_max); + // avoid zero-sized bins (happens for many ints with values in a small range) + if (boundary != last_boundary || i == 0) { + bin_boundaries.push_back(boundary); + bin_widths.push_back(boundary - last_boundary); + } + last_boundary = boundary; + } + bin_boundaries.push_back(max); // avoid rounding issues etc by explicitly picking max. + bin_widths.push_back(max - last_boundary); + + bin_boundaries.shrink_to_fit(); + size_t n_bins = bin_boundaries.size() - 1; + bins.resize(n_bins); + + // note that due to rounding, our bins may have differing sizes, which matters + // if we handle integral types (relative size difference worst case: bin width 1 vs 2). + // Be careful to select the right bin. + std::for_each(begin(range), end(range), [&](const T &val) { + auto it = std::upper_bound(bin_boundaries.begin(), bin_boundaries.end(), val); + if (it == bin_boundaries.end()) --it; // the last value is exactly max! + size_t idx = std::distance(bin_boundaries.begin(), it); + assert(idx > 0); + ++bins[idx - 1]; + }); + return ptr::make_unique<HistogramT<T>>(std::move(bins), std::move(bin_boundaries), std::move(bin_widths)); +} + +template<typename Iterable> +std::unique_ptr<Histogram> create_histogram_auto(const Iterable &range, size_t max_bins = 50) +{ + using T = typename std::remove_cv< + typename std::remove_reference< + decltype(*begin(range)) + >::type + >::type; + const size_t n = std::distance(begin(range), end(range)); + if (n == 0) return {}; + + std::map<T, size_t> elem_counts; + bool too_many_unique = false; + for (const auto &v: range) + { + ++elem_counts[v]; + if (elem_counts.size() > max_bins) + { + too_many_unique = true; + break; + } + } + + if (too_many_unique) { + return create_histogram_autorange<T>(range, max_bins); + } else { + return create_histogram_unbinned(elem_counts); + } } } // namespace ACG -#endif // ACG_HISTOGRAM_HH