Source: data/DataFrame.js

// FIXME: move this to a generic factory or similar
import { PangaeaDatasetDescription } from './PangaeaDatasetDescription.js';
import { Statistics } from './Statistics.js';

/**
 * A representation of a data frame with columns, rows and series and optional
 * metadata.
 *
 * The same cell values are reachable row-wise through `rows` and column-wise
 * through `series` (by column index as well as by column name).
 *
 * TODO: Including special metadata handling for PANGAEA metadata header. We
 * should come up with a cleaner class structure.
 *
 * @author rkoppe <roland.koppe@awi.de>
 */
export class DataFrame {

    /** Dataset metadata; replaced in read() when a commented header is found. */
    description = {};

    /** Column names in table order. */
    columns = [];
    /** Maps a column name to its index in `columns`. */
    columnMap = {};
    /** Data rows, one array of cell values per row. */
    rows = [];
    /** Column-wise values, addressable by column index and by column name. */
    series = [];

    // Regex literals on purpose: inside a plain string literal '\d' collapses
    // to 'd' and '\.' to '.', which made the former patterns match the letter
    // "d" instead of digits.
    static number = /\d+(\.\d+)?/;
    static datetime = /\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}/;

    constructor() {
    }

    /**
     * Reads the given content into columns, rows and series.
     *
     * The first non-empty line is the header, every further non-empty line is
     * a data row. Cells that convert cleanly with Number() are stored as
     * numbers, all other cells stay strings. Missing trailing cells are
     * filled with ''. Any table data from a previous read() is discarded.
     *
     * If the content starts with a terminated comment block it is
     * handed to PangaeaDatasetDescription and stored as `description`.
     *
     * @param {string} content the raw table text
     * @param {string} separator the cell separator, defaults to tab
     * @returns {DataFrame} this data frame
     */
    read(content, separator = '\t') {

        // start from a clean table so a repeated read() leaves no stale data
        this.columns = [];
        this.columnMap = {};
        this.rows = [];
        this.series = [];

        // table start position in content
        let start = 0;

        // detect commented header, e.g. PANGAEA
        if (content.startsWith('/*')) {
            const pos = content.indexOf('*/');
            // an unterminated header is not treated as a header at all
            if (pos !== -1) {
                // skips the opening marker plus one character and drops the
                // character right before the closing marker
                const meta = content.substring(3, Math.max(pos - 1, 3));
                this.description = new PangaeaDatasetDescription(meta);
                start = pos + 3;
            }
        }

        // "stream" data into rows
        let time = Date.now();

        let count = 0;
        let end = 0;
        do {
            end = content.indexOf('\n', start);
            if (end === -1) end = content.length;

            // Trailing whitespace (incl. '\r') is always dropped. Leading
            // whitespace is only dropped when the line does not start with
            // the separator: a leading separator marks an empty first cell
            // and stripping it would shift all values one column to the left.
            let line = content.substring(start, end).trimEnd();
            if (!line.startsWith(separator)) line = line.trimStart();
            start = end + 1;

            // skip empty lines
            if (line === '') continue;

            const parts = line.split(separator);

            // header
            if (count === 0) {
                this.columns = parts;
                for (let i = 0; i < parts.length; i++) {
                    const values = [];
                    this.series[i] = values;
                    this.series[parts[i]] = values;
                    this.columnMap[parts[i]] = i;
                }

                // content
            } else {
                const row = count - 1;

                for (let i = 0; i < this.columns.length; i++) {
                    let value = parts[i] ?? '';

                    // parse: validate and convert with the same function so
                    // e.g. '0x10' or a blank cell cannot pass the check and
                    // then convert to something else
                    if (value.trim() !== '') {
                        const number = Number(value);
                        if (!Number.isNaN(number)) value = number;
                    }

                    // always write back so rows and series hold equal values
                    parts[i] = value;
                    this.series[i][row] = value;
                }
                this.rows[row] = parts;
            }

            count++;
        } while (end < content.length);

        time = Date.now() - time;
        console.log('content parsed in ' + time + 'ms');

        return this;
    }

    /**
     * Calculates and returns simple Statistics.
     *
     * TODO: also for nominal values.
     *
     * @param {number[]|string} data the values or the name of a column
     * @returns {Statistics}
     * @throws {Error} if a column name is given that does not exist
     */
    statistics(data) {
        if (typeof data === 'string') {
            // own-property check: inherited names like 'toString' are no columns
            if (!Object.hasOwn(this.columnMap, data)) throw new Error('Column ' + data + ' does not exists.');
            data = this.series[this.columnMap[data]];
        }
        return new Statistics(data);
    }

    /**
     * Applies the given filter function to all rows in this data frame and
     * returns a new data frame containing all matched rows.
     *
     * The function is called as filter(rowIndex, row) and a truthy result
     * keeps the row.
     *
     * If the boolean *true* is provided, a copy of the original data frame
     * is returned. If the boolean *false* is provided, just the structure
     * of the original data frame is copied.
     *
     * Note: the returned data frame holds references to the original data
     * frames rows and shares its description.
     *
     * @param {Function|Boolean} filter
     * @returns {DataFrame}
     */
    filter(filter) {
        const df = new DataFrame();
        df.description = this.description;
        df.columns = this.columns.slice();
        df.columnMap = Object.assign({}, this.columnMap);
        df.rows = [];
        df.series = [];

        // prepare series
        for (let i = 0; i < df.columns.length; i++) {
            df.series[i] = [];
            df.series[df.columns[i]] = df.series[i];
        }

        if (!filter) return df;

        // filter data
        for (let i = 0; i < this.rows.length; i++) {
            const row = this.rows[i];

            if ((filter === true) || filter(i, row)) {
                df.rows.push(row);

                for (let j = 0; j < df.columns.length; j++) {
                    df.series[j].push(row[j]);
                }
            }
        }

        return df;
    }

    /**
     * Returns a string representation of this data frame: the header line
     * followed by the rows from begin (inclusive) to end (exclusive).
     *
     * A negative begin counts from the end of the rows and always runs to
     * the last row. An end of 0 means "up to the last row". Both bounds are
     * clamped to the available rows.
     *
     * @param {string} separator
     * @param {number} begin optional first row
     * @param {number} end optional row after the last row
     * @returns {string}
     */
    asText(separator = '\t', begin = 0, end = 0) {
        const total = this.rows.length;
        let text = this.columns.join(separator) + '\n';

        if (begin >= total) return text;
        if (begin < 0) {
            // asking for more tail rows than exist starts at the first row
            begin = Math.max(total + begin, 0);
            end = total;
        }
        // resolve the "no limit" marker before comparing the bounds
        if (end === 0) end = total;
        if (end < begin) return text;
        end = Math.min(end, total);

        for (let i = begin; i < end; i++) {
            text += this.rows[i].join(separator) + '\n';
        }

        return text;
    }

    /**
     * Returns at max count rows of the head.
     *
     * Note: a count of 0 returns all rows, because 0 is the "no limit"
     * marker of asText().
     *
     * @param {number} count
     * @param {string} separator
     * @returns {string}
     */
    head(count, separator = '\t') {
        return this.asText(separator, 0, count);
    }

    /**
     * Returns at max count rows of the tail.
     *
     * Note: a count of 0 returns all rows.
     *
     * @param {number} count
     * @param {string} separator
     * @returns {string}
     */
    tail(count, separator = '\t') {
        return this.asText(separator, -count);
    }

    /**
     * Groups values of each column for the given grouping column or given
     * function and returns a map of statistics for the columns of the data
     * frame.
     *
     * For a function it gets the current row as parameter and has to
     * return a unique value to group rows by.
     *
     * @param {string|function} grouping column name or key function
     * @param {string[]} columns optional list of columns to calculate statistics for
     * @returns {Object} map of group key to an array of Statistics, one per
     *      requested column, in the order of the columns list
     * @throws {Error} if grouping or one of the columns is invalid
     */
    groups(grouping, columns = []) {
        // validate column grouping function
        let fn = null;
        if (typeof grouping === 'string') {
            if (!Object.hasOwn(this.columnMap, grouping)) throw new Error('Column ' + grouping + ' does not exists.');
            const index = this.columnMap[grouping];
            fn = (row) => row[index];
        } else {
            if (typeof grouping !== 'function') throw new Error('Column parameter must be a string or function.');
            fn = grouping;
        }

        // validate statistic columns
        if (!columns || (columns.length === 0)) {
            columns = this.columns.slice();
        } else {
            columns = columns.slice();
        }
        for (let i = 0; i < columns.length; i++) {
            if (!Object.hasOwn(this.columnMap, columns[i])) throw new Error('Column for y ' + columns[i] + ' is not defined.');
        }

        // collect by group and column
        // key => column => values
        // No prototype on purpose: a key such as 'constructor' must not
        // resolve to an inherited Object.prototype member.
        const groups = Object.create(null);
        for (let i = 0; i < this.rows.length; i++) {
            const key = fn(this.rows[i]);

            // initialize cache
            if (!(key in groups)) {
                groups[key] = columns.map(() => []);
            }

            // collect values
            for (let j = 0; j < columns.length; j++) {
                groups[key][j].push(this.rows[i][this.columnMap[columns[j]]]);
            }
        }

        // calculate statistics for each group key and column
        const stats = {};
        for (const group in groups) {
            stats[group] = groups[group].map((values) => new Statistics(values));
        }

        return stats;
    }
}