StatisticalMeasurement.java
/*
* Copyright (c) 2001-2024, Jean Tessier
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* * Neither the name of Jean Tessier nor the names of his contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package com.jeantessier.metrics;
import org.apache.logging.log4j.*;
import java.io.*;
import java.text.*;
import java.util.*;
import java.util.regex.*;
/**
* <p>Computes the statistical properties of a given measurement across the
* submetrics of the measurement's context. Given a measurement name, it
* explores the tree of metrics rooted at the context and finds the numerical
* value of these named measurements in the tree. For these measurements, it
* computes:</p>
*
* <ul>
* <li>minimum value</li>
* <li>median value</li>
* <li>average value</li>
* <li>standard deviation</li>
* <li>maximum value</li>
* <li>sum</li>
* <li>number of data points</li>
* </ul>
*
* <p>Most of these statistical values are self-descriptive. There is a
* subtlety regarding the <i>median</i> value, though. If the number of data
* points is odd, we can sort them and return the single value in the middle.
* For the dataset <code>[1, 2, <b>3</b>, 4, 5]</code>, the median is
* <code>3</code>. But if the number of data points is even, there is no
* single value in the middle. Instead, we return the average of the two
* values in the middle of the sorted dataset. For the dataset
* <code>[1, 2, <b>3, 4</b>, 5, 6]</code>, the median is the average of
* <code>3</code> and <code>4</code>, so the median value is <code>3.5</code>.
* This is in accordance with the definition of
* <a href="https://en.wikipedia.org/wiki/Median#Finite_set_of_numbers" target="_top">median</a>
* on Wikipedia.</p>
*
* <p>This is the syntax for initializing this type of measurement:</p>
*
* <pre>
* &lt;init&gt;
* monitored measurement name [DISPOSE_x]
* [DISPOSE_x]
* &lt;/init&gt;
* </pre>
*
* <p>If the monitored measurement is itself a statistical measurement, the
* disposition indicates how to deal with it, which of its values to use in
* this measurement's calculation. The default is {@link #DISPOSE_IGNORE},
* meaning it should skip the statistical measurement and dig further in
* sub-submetrics for more raw values.</p>
*
* <p>The second disposition tells which internal value to return in calls to
* its {@link #compute} method, which will be used by clients that do not
* distinguish between StatisticalMeasurement and other Measurements. The
* default is {@link #DISPOSE_AVERAGE}.</p>
*/
public class StatisticalMeasurement extends MeasurementBase {
/** Formatter used by {@link #toString()}; the pattern prints at most two fraction digits. */
private static final NumberFormat valueFormat = new DecimalFormat("#.##");
/**
* Recognizes an init-text line that lists percentiles: optional leading
* whitespace, the letter {@code P} in either case, whitespace, then at
* least one digit. Anything may follow on the rest of the line.
*/
private static final Pattern PERCENTILE_LINE_REGEX = Pattern.compile("\\s*P\\s+\\d+.*", Pattern.CASE_INSENSITIVE);
/** Matches one run of digits; group 1 is the number. Applied repeatedly to a percentile line to pull out every value. */
private static final Pattern PERCENTILE_REGEX = Pattern.compile("(\\d+)");
/**
* Ignore nested StatisticalMeasurements and drill down to the next level
* of submetrics. When used as the self-disposition, {@link #compute}
* falls through to its default and yields {@code NaN}.
*/
public static final int DISPOSE_IGNORE = 0;
/** Use the {@link #getMinimum()} value of StatisticalMeasurements */
public static final int DISPOSE_MINIMUM = 1;
/** Use the {@link #getMedian()} value of StatisticalMeasurements */
public static final int DISPOSE_MEDIAN = 2;
/** Use the {@link #getAverage()} value of StatisticalMeasurements */
public static final int DISPOSE_AVERAGE = 3;
/** Use the {@link #getStandardDeviation()} value of StatisticalMeasurements */
public static final int DISPOSE_STANDARD_DEVIATION = 4;
/** Use the {@link #getMaximum()} value of StatisticalMeasurements */
public static final int DISPOSE_MAXIMUM = 5;
/** Use the {@link #getSum()} value of StatisticalMeasurements */
public static final int DISPOSE_SUM = 6;
/** Use the {@link #getNbDataPoints()} value of StatisticalMeasurements */
public static final int DISPOSE_NB_DATA_POINTS = 7;
/**
 * Returns the plain-words label for a disposition.
 *
 * @param dispose one of the {@code DISPOSE_*} constants
 * @return the label, or the empty string for {@link #DISPOSE_IGNORE}
 *         and for any value that is not a known disposition
 */
public static String getDisposeLabel(int dispose) {
    switch (dispose) {
        case DISPOSE_MINIMUM:
            return "minimum";
        case DISPOSE_MEDIAN:
            return "median";
        case DISPOSE_AVERAGE:
            return "average";
        case DISPOSE_STANDARD_DEVIATION:
            return "standard deviation";
        case DISPOSE_MAXIMUM:
            return "maximum";
        case DISPOSE_SUM:
            return "sum";
        case DISPOSE_NB_DATA_POINTS:
            return "number of data points";
        default:
            // DISPOSE_IGNORE and unknown values have no label.
            return "";
    }
}
/**
 * Returns the short abbreviation for a disposition.
 *
 * @param dispose one of the {@code DISPOSE_*} constants
 * @return the abbreviation, or the empty string for
 *         {@link #DISPOSE_IGNORE} and for any value that is not a known
 *         disposition
 */
public static String getDisposeAbbreviation(int dispose) {
    switch (dispose) {
        case DISPOSE_MINIMUM:
            return "min";
        case DISPOSE_MEDIAN:
            return "med";
        case DISPOSE_AVERAGE:
            return "avg";
        case DISPOSE_STANDARD_DEVIATION:
            return "sdv";
        case DISPOSE_MAXIMUM:
            return "max";
        case DISPOSE_SUM:
            return "sum";
        case DISPOSE_NB_DATA_POINTS:
            return "nb";
        default:
            // DISPOSE_IGNORE and unknown values have no abbreviation.
            return "";
    }
}
/**
 * Extracts the percentiles requested in a measurement's init text.
 *
 * <p>Each line is checked on its own. A line qualifies when it starts
 * (after optional whitespace) with the letter {@code P}, in either case,
 * followed by whitespace and a digit. Every run of digits found on a
 * qualifying line becomes one entry, in the order encountered.</p>
 *
 * @param initText the init text to scan, one directive per line
 * @return an unmodifiable list of the requested percentiles, possibly empty
 * @throws IOException if reading from the text fails
 * @throws NumberFormatException if a run of digits does not fit in an {@code int}
 */
public static List<Integer> parseRequestedPercentiles(String initText) throws IOException {
    var percentiles = new ArrayList<Integer>();

    try (var reader = new BufferedReader(new StringReader(initText))) {
        String line;
        while ((line = reader.readLine()) != null) {
            if (!PERCENTILE_LINE_REGEX.matcher(line).matches()) {
                continue;
            }

            // Collect every number on the line, not only the first one.
            var numbers = PERCENTILE_REGEX.matcher(line);
            while (numbers.find()) {
                percentiles.add(Integer.valueOf(numbers.group(1)));
            }
        }
    }

    return Collections.unmodifiableList(percentiles);
}
/**
 * Counts how many values a measurement configured with the given init
 * text exposes: a fixed seven plus one per requested percentile.
 *
 * <p>NOTE(review): the seven presumably stands for minimum, median,
 * average, standard deviation, maximum, sum, and number of data points
 * &mdash; confirm against callers.</p>
 *
 * @param initText the init text to scan for percentile lines
 * @return seven plus the number of requested percentiles, or just seven
 *         if the init text cannot be read
 */
public static int countValues(String initText) {
    final int fixedValues = 7;

    try {
        return fixedValues + parseRequestedPercentiles(initText).size();
    } catch (IOException ignored) {
        // Unreadable init text contributes no percentiles.
        return fixedValues;
    }
}
/** Name of the measurement to look up on each submetrics; set to null when the constructor fails to parse the init text. */
private String monitoredMeasurement;
/** Which value to take from a nested StatisticalMeasurement (one of the DISPOSE_* constants). */
private int dispose;
/** Which of this measurement's own values compute() returns (one of the DISPOSE_* constants). */
private int selfDispose;
/** Values gathered by the last collection pass; sorted once that pass finds at least one value. */
private List<Double> data = new LinkedList<>();
// Statistics cached by collectData(); the getters refresh them on demand.
private double minimum = 0.0;
private double median = 0.0;
private double average = 0.0;
private double standardDeviation = 0.0;
private double maximum = 0.0;
private double sum = 0.0;
private int nbDataPoints = 0;
/** Percentiles requested in the init text, in the order they were listed. */
private List<Integer> requestedPercentiles = new LinkedList<>();
/** Size of the context's submetrics at the last collection pass; starts at -1, which no collection size can equal, so the first call always collects. */
private int nbSubmetrics = -1;
/**
 * Builds the measurement and configures it from its init text.
 *
 * <p>The first line names the monitored measurement, optionally followed
 * by a {@code DISPOSE_x} token that says which value to take from nested
 * statistical measurements (default {@link #DISPOSE_IGNORE}). The second
 * line, if it contains a {@code DISPOSE_x} token, selects the value
 * returned by {@link #compute} (default {@link #DISPOSE_AVERAGE}).
 * Percentile lines anywhere in the text are collected through
 * {@link #parseRequestedPercentiles}.</p>
 *
 * <p>Any failure while reading the init text is logged and leaves the
 * monitored measurement name set to {@code null}; no exception escapes.</p>
 *
 * @param descriptor the descriptor this measurement was created from
 * @param context the metrics whose submetrics are explored
 * @param initText the configuration text described above
 */
public StatisticalMeasurement(MeasurementDescriptor descriptor, Metrics context, String initText) {
    super(descriptor, context, initText);

    try (var in = new BufferedReader(new StringReader(initText))) {
        monitoredMeasurement = in.readLine().trim();

        // Hold the lock on perl() across match-then-group so the groups
        // read back belong to this match.
        String disposeText = null;
        synchronized (perl()) {
            if (perl().match("/(.*)\\s+(dispose_\\w+)$/i", monitoredMeasurement)) {
                monitoredMeasurement = perl().group(1);
                disposeText = perl().group(2);
            }
        }

        dispose = DISPOSE_IGNORE;
        if (disposeText != null) {
            int parsed = parseDisposeName(disposeText);
            if (parsed < 0) {
                LogManager.getLogger(getClass()).error("Unknown dispose value \"{}\" for monitored measurement \"{}\" of measurement \"{}\", defaulting to DISPOSE_IGNORE", disposeText, monitoredMeasurement, descriptor.getLongName());
            } else {
                dispose = parsed;
            }
        }

        String selfDisposeText = in.readLine();

        // Same lock as above: every use of the object returned by perl()
        // in this constructor is now guarded consistently.
        boolean hasSelfDispose;
        synchronized (perl()) {
            hasSelfDispose = selfDisposeText != null && perl().match("/(dispose_\\w+)/i", selfDisposeText);
        }

        selfDispose = DISPOSE_AVERAGE;
        if (hasSelfDispose) {
            selfDisposeText = selfDisposeText.trim();
            int parsed = parseDisposeName(selfDisposeText);
            if (parsed < 0) {
                LogManager.getLogger(getClass()).error("Unknown self-dispose value \"{}\" for measurement \"{}\", defaulting to DISPOSE_AVERAGE", selfDisposeText, descriptor.getLongName());
            } else {
                selfDispose = parsed;
            }
        }

        requestedPercentiles = parseRequestedPercentiles(initText);
    } catch (Exception ex) {
        LogManager.getLogger(getClass()).error("Cannot initialize \"{}\" with init text \"{}\"", descriptor.getLongName(), initText, ex);
        monitoredMeasurement = null;
    }
}

/**
 * Maps a {@code DISPOSE_x} name, in any letter case, to its constant.
 *
 * <p>Upper-casing uses {@link Locale#ROOT} so the lookup does not depend
 * on the JVM's default locale (under a Turkish locale, a lowercase
 * {@code i} would otherwise become a dotted capital and match nothing).</p>
 *
 * @param name the disposition name to look up
 * @return the matching {@code DISPOSE_*} constant, or {@code -1} if the
 *         name is not a known disposition
 */
private static int parseDisposeName(String name) {
    return switch (name.toUpperCase(Locale.ROOT)) {
        case "DISPOSE_IGNORE" -> DISPOSE_IGNORE;
        case "DISPOSE_MINIMUM" -> DISPOSE_MINIMUM;
        case "DISPOSE_MEDIAN" -> DISPOSE_MEDIAN;
        case "DISPOSE_AVERAGE" -> DISPOSE_AVERAGE;
        case "DISPOSE_STANDARD_DEVIATION" -> DISPOSE_STANDARD_DEVIATION;
        case "DISPOSE_MAXIMUM" -> DISPOSE_MAXIMUM;
        case "DISPOSE_SUM" -> DISPOSE_SUM;
        case "DISPOSE_NB_DATA_POINTS" -> DISPOSE_NB_DATA_POINTS;
        default -> -1;
    };
}
/**
* Returns the smallest collected value, refreshing the data first if needed.
*
* @return the minimum, or {@code NaN} when no data points were collected
*/
public double getMinimum() {
collectData();
return minimum;
}
/**
* Returns the median of the collected values, refreshing the data first
* if needed. For an even number of data points this is the average of
* the two middle values of the sorted data.
*
* @return the median, or {@code NaN} when no data points were collected
*/
public double getMedian() {
collectData();
return median;
}
/**
* Returns the arithmetic mean of the collected values, refreshing the
* data first if needed.
*
* @return the average, or {@code NaN} when no data points were collected
*/
public double getAverage() {
collectData();
return average;
}
/**
* Real standard deviation of the data set.
* This is NOT the estimator "s": the sum of squared deviations is
* divided by the number of data points, not by one less.
*
* @return the standard deviation, or {@code NaN} when no data points
* were collected
*/
public double getStandardDeviation() {
collectData();
return standardDeviation;
}
/**
* Returns the largest collected value, refreshing the data first if needed.
*
* @return the maximum, or {@code NaN} when no data points were collected
*/
public double getMaximum() {
collectData();
return maximum;
}
/**
* Returns the sum of the collected values, refreshing the data first if
* needed.
*
* @return the sum, or {@code NaN} when no data points were collected
*/
public double getSum() {
collectData();
return sum;
}
/**
* Returns how many values were collected, refreshing the data first if
* needed.
*
* @return the number of data points, possibly zero
*/
public int getNbDataPoints() {
collectData();
return nbDataPoints;
}
/**
* Returns the percentiles requested in the init text, in the order they
* were listed. The list is empty when none were requested or when the
* init text could not be parsed.
*
* @return the requested percentiles
*/
public List<Integer> getRequestedPercentiles() {
return requestedPercentiles;
}
/**
 * Returns the collected value at the given percentile, using the
 * nearest-rank method: the value at 1-based rank
 * {@code ceil(percentile / 100 * n)} of the sorted data.
 *
 * <p>The rank is computed in integer arithmetic. The floating-point form
 * can overshoot by one: {@code 7 / 100.0 * 100} evaluates to
 * {@code 7.000000000000001}, whose ceiling is 8 rather than 7.</p>
 *
 * <p>Out-of-range requests are clamped instead of failing: a percentile
 * of 0 or less yields the minimum, and anything above 100 yields the
 * maximum.</p>
 *
 * @param percentile the percentile to look up, nominally 1 to 100
 * @return the value at that percentile, or {@code NaN} when no data
 *         points were collected
 */
public double getPercentile(int percentile) {
    collectData();

    // Work from one reference so size and lookup agree with each other.
    List<Double> values = data;
    int size = values.size();
    if (size == 0) {
        return Double.NaN;
    }

    // ceil(percentile * size / 100) without going through doubles.
    long rank = ((long) percentile * size + 99) / 100;

    // Ranks are 1-based; keep the 0-based index inside the list.
    int pos = (int) Math.max(0L, Math.min(rank - 1, size - 1L));
    return values.get(pos);
}
/**
 * Recomputes the cached statistics when the number of submetrics in the
 * context differs from the number seen at the last pass; otherwise does
 * nothing.
 *
 * <p>A pass clears the data, marks this measurement empty, visits every
 * submetrics to gather values, then derives minimum, median, maximum,
 * sum, average, and standard deviation from the sorted values. With no
 * values, all of those are {@code NaN} and the count is zero.</p>
 *
 * <p>The values live in an {@code ArrayList} because they are sorted and
 * then read by index. The sums are accumulated sequentially, in sorted
 * order, so repeated passes over the same values give the same result;
 * floating-point addition is not associative, and a parallel reduction
 * may group the terms differently from run to run.</p>
 */
private void collectData() {
    if (getContext().getSubMetrics().size() == nbSubmetrics) {
        return;
    }

    synchronized (this) {
        // Re-check under the lock: another thread may have just finished a pass.
        if (getContext().getSubMetrics().size() == nbSubmetrics) {
            return;
        }

        data = new ArrayList<>();
        setEmpty(true);
        getContext().getSubMetrics().forEach(this::visitMetrics);

        nbDataPoints = data.size();

        if (data.isEmpty()) {
            minimum = Double.NaN;
            median = Double.NaN;
            maximum = Double.NaN;
            sum = Double.NaN;
            average = Double.NaN;
            standardDeviation = Double.NaN;
        } else {
            Collections.sort(data);

            minimum = data.get(0);
            maximum = data.get(nbDataPoints - 1);

            int middle = nbDataPoints / 2;
            if (nbDataPoints % 2 == 0) {
                // Even-sized list, average the two middle values
                median = (data.get(middle - 1) + data.get(middle)) / 2;
            } else {
                // Odd-sized list, use the middle value
                median = data.get(middle);
            }

            sum = data.stream()
                    .reduce(Double::sum)
                    .orElse(Double.NaN);
            average = sum / nbDataPoints;

            // Population standard deviation: divide by n, not n - 1.
            double squaredDeviations = data.stream()
                    .map(n -> Math.pow(n - average, 2))
                    .reduce(Double::sum)
                    .orElse(0.0);
            standardDeviation = Math.sqrt(squaredDeviations / nbDataPoints);
        }

        // Remember the size this pass was computed for.
        nbSubmetrics = getContext().getSubMetrics().size();
    }
}
/**
 * Gathers the monitored measurement's value from one metrics node.
 *
 * <ul>
 * <li>If the node's measurement is a StatisticalMeasurement, the
 *     {@code dispose} setting picks which of its values to record;
 *     {@link #DISPOSE_IGNORE} (or any other value) skips it and visits
 *     the node's own submetrics instead.</li>
 * <li>If it is a NullMeasurement, the node's submetrics are visited.</li>
 * <li>Otherwise its value is recorded, unless that value is null.</li>
 * </ul>
 *
 * <p>While this measurement is still flagged empty, the flag is updated
 * from the visited measurement.</p>
 *
 * @param metrics the node to inspect
 */
private void visitMetrics(Metrics metrics) {
    var log = LogManager.getLogger(getClass());

    log.debug("VisitMetrics: {}", metrics.getName());

    Measurement measurement = metrics.getMeasurement(monitoredMeasurement);

    log.debug("measurement for {} is {}", monitoredMeasurement, measurement.getClass());

    boolean descend = false;

    if (measurement instanceof StatisticalMeasurement stats) {
        log.debug("dispose of StatisticalMeasurements is {}", dispose);

        // null means "no value picked here, look one level down".
        Double picked = switch (dispose) {
            case DISPOSE_MINIMUM -> {
                log.debug("using Minimum(): {}", stats.getMinimum());
                yield stats.getMinimum();
            }
            case DISPOSE_MEDIAN -> {
                log.debug("using Median(): {}", stats.getMedian());
                yield stats.getMedian();
            }
            case DISPOSE_AVERAGE -> {
                log.debug("using Average(): {}", stats.getAverage());
                yield stats.getAverage();
            }
            case DISPOSE_STANDARD_DEVIATION -> {
                log.debug("using StandardDeviation(): {}", stats.getStandardDeviation());
                yield stats.getStandardDeviation();
            }
            case DISPOSE_MAXIMUM -> {
                log.debug("using Maximum(): {}", stats.getMaximum());
                yield stats.getMaximum();
            }
            case DISPOSE_SUM -> {
                log.debug("using Sum(): {}", stats.getSum());
                yield stats.getSum();
            }
            case DISPOSE_NB_DATA_POINTS -> {
                log.debug("using NbDataPoints(): {}", stats.getNbDataPoints());
                yield (double) stats.getNbDataPoints();
            }
            default -> null;
        };

        if (picked != null) {
            data.add(picked);
        } else {
            descend = true;
        }
    } else if (measurement instanceof NullMeasurement) {
        descend = true;
    } else {
        Number value = measurement.getValue();

        log.debug("{} on {} is {}", monitoredMeasurement, metrics.getName(), value);

        if (value != null) {
            data.add(value.doubleValue());
        }
    }

    if (descend) {
        log.debug("Skipping to next level ...");
        metrics.getSubMetrics().forEach(this::visitMetrics);
    }

    if (super.isEmpty()) {
        setEmpty(measurement.isEmpty());
    }
}
/**
* Tells whether this measurement is empty, refreshing the collected data
* first if needed. A collection pass resets the flag to empty and then
* updates it from the measurements it visits.
*
* @return the empty flag held by the superclass after the refresh
*/
public boolean isEmpty() {
collectData();
return super.isEmpty();
}
/**
* Dispatches to the visitor's handler for statistical measurements.
*
* @param visitor the visitor to call back with this measurement
*/
public void accept(MeasurementVisitor visitor) {
visitor.visitStatisticalMeasurement(this);
}
/**
 * Returns the single value that stands for this measurement, as selected
 * by the self-disposition.
 *
 * @return the selected statistic, or {@code NaN} when the
 *         self-disposition is {@link #DISPOSE_IGNORE} or not a known
 *         disposition
 */
protected double compute() {
    switch (selfDispose) {
        case DISPOSE_MINIMUM:
            return getMinimum();
        case DISPOSE_MEDIAN:
            return getMedian();
        case DISPOSE_AVERAGE:
            return getAverage();
        case DISPOSE_STANDARD_DEVIATION:
            return getStandardDeviation();
        case DISPOSE_MAXIMUM:
            return getMaximum();
        case DISPOSE_SUM:
            return getSum();
        case DISPOSE_NB_DATA_POINTS:
            return getNbDataPoints();
        default:
            // Nothing sensible to report for DISPOSE_IGNORE or unknown values.
            return Double.NaN;
    }
}
/**
 * Formats the statistics as
 * {@code [minimum median/average standardDeviation maximum sum (count)]},
 * each printed with at most two fraction digits.
 *
 * <p>Formatting goes through a private copy of the shared formatter.
 * {@code DecimalFormat} is not safe for concurrent use, so calling
 * {@code format} on the shared static instance from several threads
 * could corrupt the output.</p>
 *
 * @return the formatted statistics
 */
@Override
public String toString() {
    var format = (NumberFormat) valueFormat.clone();

    return "[" + format.format(getMinimum()) +
            " " + format.format(getMedian()) +
            "/" + format.format(getAverage()) +
            " " + format.format(getStandardDeviation()) +
            " " + format.format(getMaximum()) +
            " " + format.format(getSum()) +
            " (" + format.format(getNbDataPoints()) + ")]";
}
}