In [10]:
# %% [markdown]
# # Data Cleaning, Reshaping, and Aggregation for Meyer Pecan Cracking Dataset
#
# This notebook loads the raw dataset ("meyer.csv"), standardizes column names and data types,
# extracts video IDs from crack outcome columns, reshapes the data into long format, and then aggregates
# the data into a binary indicator format (one row per video) with concise crack outcome column names.
#
# The final crack outcome columns will be renamed as follows:
#   U   : untouched
#   L_1 : longitudinal_less_than_25%
#   L_2 : longitudinal_between_25_50%
#   L_3 : longitudinal_between_50_75%
#   L_4 : longitudinal_more_than_75%
#   C_1 : circumferential_less_than_25%
#   C_2 : circumferential_between_25_50%
#   C_3 : circumferential_between_50_75%
#   C_4 : circumferential_more_than_75%
#   O   : open_crack
#   X   : crushed
#
# Only one final CSV file ("meyer_aggregated_binary_renamed.csv") is saved.

# %% [markdown]
# ## Cell 1: Import Libraries and Load Data

# %%
import pandas as pd
import numpy as np

# Load the raw dataset
# Read the raw CSV into `df`, then report its dimensions and preview the first rows.
raw_csv_path = "meyer.csv"
df = pd.read_csv(raw_csv_path)
print("Initial DataFrame shape:", df.shape)
df.head()

# %% [markdown]
# ## Cell 2: Standardize Column Names
# Convert all column names to lower case, strip extra spaces, and replace spaces/hyphens with underscores.

# %%
# Normalise the headers step by step: lower-case, trim surrounding whitespace,
# then turn spaces and hyphens into underscores (in that order).
normalized_names = df.columns.str.lower().str.strip()
for separator in (" ", "-"):
    normalized_names = normalized_names.str.replace(separator, "_")
df.columns = normalized_names
print("Renamed columns:", df.columns.tolist())

# %% [markdown]
# ## Cell 3: Ensure Factor Columns are Numeric
# Convert factor columns to numeric (entries that cannot be parsed become NaN) and report the number of missing values per column.

# %%
# The four experimental factors recorded for every run.
factor_cols = ["moisture", "spring_stiffness", "displacement_screw_setting", "motor_speed"]

# Coerce each factor column to a numeric dtype; values that cannot be parsed become NaN.
numeric_factors = {name: pd.to_numeric(df[name], errors="coerce") for name in factor_cols}
for name, converted in numeric_factors.items():
    df[name] = converted

# Report how many NaN entries each factor column now contains.
missing_per_factor = df[factor_cols].isnull().sum()
print("Missing values in factor columns:")
print(missing_per_factor)

# %% [markdown]
# ## Cell 4: Identify Crack Outcome Columns
# All columns not in factor_cols are considered crack outcome columns.

# %%
# Columns this notebook adds to `df` later on (see Cell 6). They are excluded here so that
# re-running this cell after Cell 6 does not misclassify them as crack outcome columns,
# which would cause their values to be parsed as video IDs.
derived_cols = ["video_count", "missing_videos_flag"]

# Every remaining non-factor column is treated as a crack outcome column.
non_outcome_cols = set(factor_cols) | set(derived_cols)
crack_cols = [c for c in df.columns if c not in non_outcome_cols]
print("Crack columns identified:", crack_cols)

# %% [markdown]
# ## Cell 5: Define Helper Functions for Video Extraction
# Define functions to extract video IDs from a cell and count unique video IDs across the crack outcome columns.


# %%
def extract_video_ids(cell_value: str) -> list:
    """
    Extracts and cleans video IDs from a single crack outcome cell.

    The cell is expected to hold a comma-separated list of IDs. Missing cells
    (NaN/None) yield an empty list. Non-string cells are converted to text first,
    so a cell that pandas parsed as a number (e.g. a column holding only single
    numeric IDs) is handled instead of raising AttributeError; a float with an
    integral value such as 12.0 is rendered as "12".

    Parameters:
        cell_value: The raw cell content (normally a string, possibly NaN or a number).

    Returns:
        A list of non-empty, whitespace-stripped ID strings in their original order.
    """
    if pd.isna(cell_value):
        return []
    # Integral floats come from numeric parsing of a lone ID; drop the spurious ".0".
    if isinstance(cell_value, float) and cell_value.is_integer():
        cell_value = int(cell_value)
    tokens = (token.strip() for token in str(cell_value).split(","))
    return [token for token in tokens if token]


def count_unique_videos(row: pd.Series) -> int:
    """
    Returns how many distinct video IDs appear in one row.

    Every column listed in the module-level `crack_cols` is parsed with
    `extract_video_ids`; an ID that occurs in several columns is counted once.
    """
    distinct_ids = set()
    for outcome_col in crack_cols:
        distinct_ids.update(extract_video_ids(row[outcome_col]))
    return len(distinct_ids)


# %% [markdown]
# ## Cell 6: Compute Video Count and Flag Incomplete Runs
# Add 'video_count' and 'missing_videos_flag' columns.

# %%
# Runs with fewer distinct videos than this threshold are flagged as incomplete.
expected_videos_per_run = 6

# Count the distinct video IDs per run (row-wise) and flag the short runs.
df["video_count"] = df.apply(count_unique_videos, axis=1)
df["missing_videos_flag"] = df["video_count"].lt(expected_videos_per_run)

print(df[["video_count", "missing_videos_flag"]].head(10))
n_incomplete_runs = df["missing_videos_flag"].sum()
print(f"Number of runs with fewer than 6 videos: {n_incomplete_runs}")

# %% [markdown]
# ## Cell 7: Data Integrity Checks for Factor Columns
# Verify that factor values are within expected ranges.

# %%
# Print the distinct values of each factor so they can be compared with the expected levels.
integrity_checks = [
    ("Motor Speed", "motor_speed"),  # Expected: [30, 45, 60]
    ("Moisture", "moisture"),  # Expected: [5, 7, 9]
    ("Displacement Screw Setting", "displacement_screw_setting"),  # Expected: [0.22, 0.29, 0.36]
    ("Spring Stiffness", "spring_stiffness"),  # Expected: [1800, 2000, 2200]
]
for label, factor_name in integrity_checks:
    print(f"{label} values:", df[factor_name].unique())

# %% [markdown]
# ## Cell 8: Reshape Data to Long Format
# Convert the wide-format dataset to long format so that each row corresponds to one video (pecan) with its crack type.

# %%
# Build one record per (run, crack outcome column, video ID). Each record carries the
# run's factor values followed by the video ID and the name of the outcome column in
# which that ID was listed. Runs are visited in row order and outcome columns in
# `crack_cols` order.
records = [
    {
        **{factor: run[factor] for factor in factor_cols},
        "video_id": video_id,
        "crack_type": outcome_col,  # Use the original crack category name.
    }
    for _, run in df.iterrows()
    for outcome_col in crack_cols
    for video_id in extract_video_ids(run[outcome_col])
]

df_long = pd.DataFrame(records)
print("Long-format DataFrame shape:", df_long.shape)
df_long.head(10)

# %% [markdown]
# ## Cell 9: Aggregate Long-Format Data into a Binary Format
# Aggregate the long-format data so that each unique video_id (with its factors) has binary indicators
# for each crack outcome. We use `pivot_table` with `crack_type` as the column key so that the indicator
# columns keep the full crack outcome names.

# %%
# Build one row per (video_id, factor combination) with a 0/1 indicator per crack type.
# An explicit constant indicator column is pivoted, rather than pivoting `crack_type` onto
# itself with a lambda aggregator, so the result does not depend on how pandas treats a
# `values` column that is also a grouping key. `aggfunc="max"` collapses repeated
# (video, crack type) pairs to a single 1, and `fill_value=0` marks absent pairs.
pivot_index_cols = ["video_id", "moisture", "spring_stiffness", "displacement_screw_setting", "motor_speed"]
df_aggregated = (
    df_long.assign(crack_present=1)
    .pivot_table(index=pivot_index_cols, columns="crack_type", values="crack_present", aggfunc="max", fill_value=0)
    .rename_axis(columns=None)  # drop the leftover "crack_type" label on the column axis
    .reset_index()
)

# Check the raw column names after pivoting
print("Raw aggregated columns:", df_aggregated.columns.tolist())

# %% [markdown]
# ## Cell 10: Reassign Factor Column Names and Rename Crack Outcome Columns
# We explicitly set the first 5 columns to our expected factor names, then rename the remaining crack columns
# using the following concise mapping:
#
#   U   : untouched
#   L_1 : longitudinal_less_than_25%
#   L_2 : longitudinal_between_25_50%
#   L_3 : longitudinal_between_50_75%
#   L_4 : longitudinal_more_than_75%
#   C_1 : circumferential_less_than_25%
#   C_2 : circumferential_between_25_50%
#   C_3 : circumferential_between_50_75%
#   C_4 : circumferential_more_than_75%
#   O   : open_crack
#   X   : crushed

# %%
# Expected factor column names in order:
expected_factor_cols = ["video_id", "moisture", "spring_stiffness", "displacement_screw_setting", "motor_speed"]

# Overwrite the leading column labels, position by position, with the expected factor names.
raw_cols = df_aggregated.columns.tolist()
for position, factor_name in enumerate(expected_factor_cols):
    raw_cols[position] = factor_name
df_aggregated.columns = raw_cols

print("After reassigning factor names:")
print(df_aggregated.columns.tolist())

# Everything from position 5 onward is a crack outcome column still carrying its long name.
print("Crack outcome columns (raw):", df_aggregated.columns[5:])

# Long crack outcome name -> concise code.
rename_dict = {
    "untouched": "U",
    "longitudinal_less_than_25%": "L_1",
    "longitudinal_between_25_50%": "L_2",
    "longitudinal_between_50_75%": "L_3",
    "longitudinal_more_than_75%": "L_4",
    "circumferential_less_than_25%": "C_1",
    "circumferential_between_25_50%": "C_2",
    "circumferential_between_50_75%": "C_3",
    "circumferential_more_than_75%": "C_4",
    "open_crack": "O",
    "crushed": "X",
}

# Translate each outcome column to its code; names without a mapping are kept unchanged.
new_crack_cols = []
for long_name in df_aggregated.columns[5:]:
    new_crack_cols.append(rename_dict.get(long_name, long_name))
df_aggregated.columns = expected_factor_cols + new_crack_cols

print("Final aggregated binary column names:")
print(df_aggregated.columns.tolist())

# %% [markdown]
# ## Cell 11: Save the Final Aggregated Binary Dataset
# Save the final aggregated binary dataset as a single CSV file.

# %%
# Write the aggregated table to disk without the DataFrame index, then confirm.
output_csv_path = "meyer_aggregated_binary_renamed.csv"
df_aggregated.to_csv(output_csv_path, index=False)
print(f"Final renamed aggregated binary dataset saved as '{output_csv_path}'.")

Initial DataFrame shape: (243, 15)
Renamed columns: ['moisture', 'spring_stiffness', 'displacement_screw_setting', 'motor_speed', 'untouched', 'longitudinal_less_than_25%', 'longitudinal_between_25_50%', 'longitudinal_between_50_75%', 'longitudinal_more_than_75%', 'circumferential_less_than_25%', 'circumferential_between_25_50%', 'circumferential_between_50_75%', 'circumferential_more_than_75%', 'open_crack', 'crushed']
Missing values in factor columns:
moisture                      0
spring_stiffness              0
displacement_screw_setting    0
motor_speed                   0
dtype: int64
Crack columns identified: ['untouched', 'longitudinal_less_than_25%', 'longitudinal_between_25_50%', 'longitudinal_between_50_75%', 'longitudinal_more_than_75%', 'circumferential_less_than_25%', 'circumferential_between_25_50%', 'circumferential_between_50_75%', 'circumferential_more_than_75%', 'open_crack', 'crushed']
   video_count  missing_videos_flag
0            6                False
1       

In [11]:
# Desired order for crack outcome columns:
desired_order = ["U", "L_1", "L_2", "L_3", "L_4", "C_1", "C_2", "C_3", "C_4", "O", "X"]

# The factor columns stay in front, in this order.
factor_order = ["video_id", "moisture", "spring_stiffness", "displacement_screw_setting", "motor_speed"]

# Every column that is not a factor column is treated as a crack outcome column.
current_crack_cols = [col for col in df_aggregated.columns if col not in factor_order]

# Outcome columns with a known short code, arranged in the desired order.
# Codes that are absent from the data are simply skipped.
new_crack_cols = [col for col in desired_order if col in current_crack_cols]

# Outcome columns without a known short code (e.g. a header that missed the renaming step)
# are kept at the end and reported, instead of being silently dropped from the output.
unmapped_crack_cols = [col for col in current_crack_cols if col not in desired_order]
if unmapped_crack_cols:
    print("Warning: crack outcome columns without a short code were kept at the end:", unmapped_crack_cols)

# Combine the factor columns with the ordered crack outcome columns.
new_column_order = factor_order + new_crack_cols + unmapped_crack_cols

# Reorder the DataFrame's columns.
df_aggregated = df_aggregated[new_column_order]
print("Reordered aggregated binary columns:")
print(df_aggregated.columns.tolist())

Reordered aggregated binary columns:
['video_id', 'moisture', 'spring_stiffness', 'displacement_screw_setting', 'motor_speed', 'U', 'L_1', 'L_2', 'L_3', 'L_4', 'C_1', 'C_2', 'C_3', 'C_4', 'O', 'X']


In [12]:
# Overwrite the CSV with the reordered table (DataFrame index omitted), then confirm.
output_csv_path = "meyer_aggregated_binary_renamed.csv"
df_aggregated.to_csv(output_csv_path, index=False)
print(f"Final renamed aggregated binary dataset saved as '{output_csv_path}'.")

Final renamed aggregated binary dataset saved as 'meyer_aggregated_binary_renamed.csv'.
