Module saePisan.service.exploration.SummaryData

Functions

def extract_formatted_multiple(r_output: str) -> tuple[polars.dataframe.frame.DataFrame, polars.dataframe.frame.DataFrame]
Expand source code
def extract_formatted_multiple(r_output: str) -> tuple[pl.DataFrame, pl.DataFrame]:
    """Parse a printed multi-variable R summary into two tables.

    Every useful line of ``r_output`` is expected to have the shape
    ``<row index> <variable name> <statistic> : <value>``. Lines matching
    neither the numeric nor the string pattern (such as a header row) are
    ignored.

    Args:
        r_output: Text of the printed summary table, one row per line.

    Returns:
        A ``(numeric, character)`` pair of DataFrames with one row per
        variable. The numeric frame has the columns ``Variable Name``,
        ``Minimum``, ``Quartil 1``, ``Median``, ``Mean``, ``Quartil 3``,
        ``Maximum`` and ``Missing``; the character frame has
        ``Variable Name``, ``Length``, ``Class`` and ``Mode``. A statistic
        that was not found for a variable is ``None``.

    Raises:
        ValueError: If a captured numeric token (for example a date such as
            ``2020-01-01``) cannot be converted to ``float``.
    """
    # R statistic label -> output column name. Insertion order is also the
    # column order of the numeric table.
    stat_label_map = {
        "Min.": "Minimum",
        "1st Qu.": "Quartil 1",
        "Median": "Median",
        "Mean": "Mean",
        "3rd Qu.": "Quartil 3",
        "Max.": "Maximum",
        "NA's": "Missing",
    }
    string_keys = ("Length", "Class", "Mode")

    # Numeric summary rows (Min., 1st Qu., ..., Max., NA's). The value group
    # takes the whole numeric token, including an exponent, so a value that R
    # prints in scientific notation (e.g. 1.200e+05) is not cut off at the
    # "e" and silently read as 1.2.
    pattern_num = re.compile(
        r"^\d+\s+([A-Za-z0-9_.\s]+)\s+"
        r"(Min\.|1st Qu\.|Median|Mean|3rd Qu\.|Max\.|NA's)\s*:\s*"
        r"(<NA>|[-+.\d][-+.\deE]*)"
    )
    # String summary rows (Length, Class, Mode).
    pattern_char = re.compile(
        r"^\d+\s+([A-Za-z0-9_.\s]+)\s+(Length|Class|Mode)\s*:\s*(.*)"
    )

    # Drop the surrounding list brackets, blank lines and per-line quotes.
    lines = r_output.strip().strip("[]").split("\n")
    lines = [line.strip(" '") for line in lines if line.strip()]

    # variable name -> {statistic label: value}
    numeric_summary = defaultdict(dict)
    string_summary = defaultdict(dict)

    for line in lines:
        if match := pattern_num.match(line):
            var_name, stat, value_str = match.groups()
            value = None if value_str == "<NA>" else float(value_str)
            numeric_summary[var_name.strip()][stat] = value
        elif match := pattern_char.match(line):
            var_name, stat, value_str = match.groups()
            string_summary[var_name.strip()][stat] = value_str.strip()

    # Numeric table: one row per variable, one column per statistic.
    num_result = {"Variable Name": list(numeric_summary)}
    for stat, label in stat_label_map.items():
        num_result[label] = [stats.get(stat) for stats in numeric_summary.values()]

    # Character table: one row per variable, one column per key.
    char_result = {"Variable Name": list(string_summary)}
    for key in string_keys:
        char_result[key] = [stats.get(key) for stats in string_summary.values()]

    return pl.DataFrame(num_result), pl.DataFrame(char_result)
def extract_formatted_single(r_output: str, r_script: str) -> polars.dataframe.frame.DataFrame
Expand source code
def extract_formatted_single(r_output: str, r_script: str) -> pl.DataFrame:
    """Parse a printed single-variable R summary into a one-row table.

    Each line of ``r_output`` is split on whitespace. A line with at least
    three tokens is read as ``<row index> <statistic ...> <value>``, and
    shorter lines (such as a two-word header) are skipped. Values that parse
    as ``float`` are stored as floats, everything else as the original text.

    Args:
        r_output: Text of the printed summary table, one statistic per line.
        r_script: R script that produced the summary. The variable name is
            taken from the first ``("name")`` occurrence in it.

    Returns:
        A one-row DataFrame with a ``Variable Name`` column followed by one
        column per statistic found. Known R labels are renamed through the
        mapping below; unknown labels are kept as they are.

    Raises:
        ValueError: If ``r_script`` contains no ``("name")`` occurrence.
    """
    # The variable name is whatever sits between (" and ") in the R script.
    name_match = re.search(r'(?<=\(")(.*?)(?="\))', r_script)
    if name_match is None:
        raise ValueError(
            'Could not find a quoted variable name like ("name") in the R script'
        )
    variable_name = name_match.group(0)

    # R statistic label -> output column name.
    mapping = {
        "Min.": "Minimum",
        "1st Qu.": "Quartile 1",
        "Median": "Median",
        "Mean": "Mean",
        "3rd Qu.": "Quartile 3",
        "Max.": "Maximum",
        "Length": "Length",
        "Class": "Class",
        "Mode": "Mode",
    }

    data = {"Variable Name": [variable_name]}

    for line in r_output.strip().split("\n"):
        parts = line.split()
        if len(parts) < 3:
            # Blank lines and the header row carry no statistic.
            continue

        # First token is the row index, last token the value; anything in
        # between is the (possibly multi-word) statistic label.
        stat = " ".join(parts[1:-1])
        raw_value = parts[-1]

        try:
            value = float(raw_value)
        except ValueError:
            value = raw_value

        data[mapping.get(stat, stat)] = [value]

    # A "Length" row marks a character summary; Class and Mode are then
    # reported as "character" regardless of what was parsed for them.
    if "Length" in data:
        data["Class"] = ["character"]
        data["Mode"] = ["character"]

    return pl.DataFrame(data)
def run_summary_data(parent)
Expand source code
def run_summary_data(parent):
    """
    Run data summary using Python (Polars) and R.

    The data of ``parent.model1`` and ``parent.model2`` are joined side by
    side, rows that are null in every column are dropped, and
    ``parent.r_script`` is executed in R. The printed R summary is then parsed
    into Polars DataFrames.

    Args:
        parent: Object providing ``activate_R()``, ``model1``, ``model2`` and
            ``r_script``. Its ``result`` attribute is set to a dict of
            summary tables on success. On failure ``result`` is set to the
            error message and ``error`` is set to ``True``.
    """
    parent.activate_R()

    # Get data from model
    df1 = parent.model1.get_data()
    df2 = parent.model2.get_data()

    # Combine data using Polars and drop rows that are null in every column.
    df = pl.concat([df1, df2], how="horizontal")
    df = df.filter(~pl.all_horizontal(pl.all().is_null()))
    # NOTE(review): get_data is defined elsewhere; presumably it exposes df
    # to R as `r_df`, which the R code below reads - confirm.
    get_data(parent, df)

    try:
        # Set data in R
        ro.r('data <- as.data.frame(r_df)')

        # Execute R script from parent; the code below expects it to define
        # `summary_results`.
        ro.r(parent.r_script)

        # Create summary_table in R
        ro.r('''
            if (is.matrix(summary_results)) {
                summary_table <- as.data.frame(summary_results)
            } else if (is.atomic(summary_results)) {
                summary_table <- data.frame(stat = names(summary_results), value = as.numeric(summary_results))
            }
        ''')

        # Get the number of columns of summary_table in R; two columns
        # (stat, value) indicate the single-variable layout built above.
        ncol = int(ro.r('ncol(summary_table)')[0])

        # Get the output summary_table as a string
        summary_str = ro.r('capture.output(print(summary_table))')
        summary_str_joined = "\n".join(summary_str)
        print("[DEBUG] Summary String:", summary_str_joined)
        parent.result = {}

        if ncol == 2:
            summary_table = extract_formatted_single(summary_str_joined, parent.r_script)
            parent.result["Summary Table"] = summary_table
        else:
            # Use extract_formatted_multiple for multi-variable data
            numeric_df, string_df = extract_formatted_multiple(summary_str_joined)

            # Only publish the tables that actually contain rows.
            if numeric_df.shape[0] > 0:
                parent.result["Summary Table (Numeric)"] = numeric_df
            if string_df.shape[0] > 0:
                parent.result["Summary Table (Character)"] = string_df

    except Exception as e:
        # Report the failure to the caller through the parent object.
        parent.result = str(e)
        parent.error = True

Run data summary using Python (Polars) and R.