In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Read the raw experiment table from disk and show its size plus a preview
raw_csv_path = "meyer.csv"
df = pd.read_csv(raw_csv_path)
print("Initial DataFrame shape:", df.shape)
df.head()

Initial DataFrame shape: (243, 15)


Unnamed: 0,moisture,spring_stiffness,displacement_screw_setting,motor_speed,untouched,longitudinal less than 25%,Longitudinal between 25-50%,Longitudinal between 50-75%,Longitudinal more than 75%,Circumferential less than 25%,Circumferential between 25-50%,Circumferential between 50-75%,Circumferential more than 75%,Open Crack,Crushed
0,5,1800,0.29,60,,,,,"*GH013810, *GH013811, *GH013812, *GH013813, *G...",,"*GH013812, *GH013813, *GH013814",,"*GH013810, *GH013811, *GH013815","*GH013810, *GH013811, *GH013812, *GH013813, *G...",
1,5,1800,0.22,45,,,,,"*GH013816, *GH013817, *GH013818, *GH013819, *G...",,*GH013818,*GH013821,"*GH013816, *GH013817, *GH013819, *GH013820","*GH013816, *GH013817, *GH013818, *GH013819, *G...",
2,5,1800,0.36,30,*GH013822,,,*GH013823,"*GH013824, *GH013825, *GH013826, *GH013827","*GH013823, *GH013826",,"*GH013824, *GH013827",*GH013825,"*GH013823, *GH013824, *GH013825, *GH013827",
3,5,1800,0.36,60,*GH013832,,,,"*GH013828, *GH013829, *GH013830, *GH013831, *G...",*GH013829,"*GH013830, *GH013833",*GH013828,*GH013831,"*GH013828, *GH013830, *GH013831, *GH013833",
4,5,1800,0.22,30,,,,,"*GH013834, *GH013835, *GH013836, *GH013837, *G...",,,*GH013836,"*GH013834, *GH013835, *GH013837, *GH013838, *G...","*GH013834, *GH013835, *GH013836, *GH013837, *G...",


In [127]:
# Normalise headers: lowercase, trim surrounding whitespace, then turn
# spaces and hyphens into underscores (applied in that order).
normalized_names = df.columns.str.lower().str.strip()
for separator in (" ", "-"):
    normalized_names = normalized_names.str.replace(separator, "_")
df.columns = normalized_names
print("Renamed columns:", df.columns.tolist())

Renamed columns: ['moisture', 'spring_stiffness', 'displacement_screw_setting', 'motor_speed', 'untouched', 'longitudinal_less_than_25%', 'longitudinal_between_25_50%', 'longitudinal_between_50_75%', 'longitudinal_more_than_75%', 'circumferential_less_than_25%', 'circumferential_between_25_50%', 'circumferential_between_50_75%', 'circumferential_more_than_75%', 'open_crack', 'crushed']


In [128]:
# Experimental factor columns; coerce all of them to numeric in one pass.
# errors="coerce" turns unparseable entries into NaN, which the summary below reports.
factor_cols = ["moisture", "spring_stiffness", "displacement_screw_setting", "motor_speed"]
df[factor_cols] = df[factor_cols].apply(pd.to_numeric, errors="coerce")
print("Missing values in factor columns:")
print(df[factor_cols].isna().sum())

Missing values in factor columns:
moisture                      0
spring_stiffness              0
displacement_screw_setting    0
motor_speed                   0
dtype: int64


In [129]:
# Every column that is not an experimental factor is treated as a crack outcome column
crack_cols = list(filter(lambda name: name not in factor_cols, df.columns))
print("Crack columns identified:", crack_cols)

Crack columns identified: ['untouched', 'longitudinal_less_than_25%', 'longitudinal_between_25_50%', 'longitudinal_between_50_75%', 'longitudinal_more_than_75%', 'circumferential_less_than_25%', 'circumferential_between_25_50%', 'circumferential_between_50_75%', 'circumferential_more_than_75%', 'open_crack', 'crushed']


In [130]:
# Define functions to extract video IDs from a cell and count unique video IDs across crack columns.
def extract_video_ids(cell_value: str) -> list:
    """Split a comma-separated cell into a list of trimmed, non-empty video IDs.

    A missing cell (anything ``pd.isna`` reports as missing, e.g. NaN or None)
    yields an empty list. Otherwise the value is split on commas, each piece is
    stripped of surrounding whitespace, and pieces that end up empty are dropped.
    """
    if pd.isna(cell_value):
        return []
    trimmed_pieces = (piece.strip() for piece in cell_value.split(","))
    return [piece for piece in trimmed_pieces if piece]


def count_unique_videos(row: pd.Series) -> int:
    """Return the number of distinct video IDs found across a row's crack columns.

    Reads the module-level ``crack_cols`` list and parses each of those cells
    with ``extract_video_ids``; an ID listed under several crack columns is
    counted once.
    """
    unique_ids = set()
    for column_name in crack_cols:
        unique_ids.update(extract_video_ids(row[column_name]))
    return len(unique_ids)

In [131]:
# Count the distinct videos in every run, then flag runs that have fewer than 6
df["video_count"] = df.apply(count_unique_videos, axis=1)
df["missing_videos_flag"] = df["video_count"].lt(6)
flag_preview = df[["video_count", "missing_videos_flag"]].head(10)
print(flag_preview)
incomplete_run_total = df["missing_videos_flag"].sum()
print(f"Number of runs with fewer than 6 videos: {incomplete_run_total}")

   video_count  missing_videos_flag
0            6                False
1            6                False
2            6                False
3            6                False
4            6                False
5            5                 True
6            6                False
7            6                False
8            6                False
9            6                False
Number of runs with fewer than 6 videos: 16


In [132]:
# Print the distinct values of each factor so they can be compared by eye
# against the expected design levels noted beside each entry.
factor_value_checks = [
    ("Motor Speed values:", "motor_speed"),  # Expected: [30, 45, 60]
    ("Moisture values:", "moisture"),  # Expected: [5, 7, 9]
    ("Displacement Screw Setting values:", "displacement_screw_setting"),  # Expected: [0.22, 0.29, 0.36]
    ("Spring Stiffness values:", "spring_stiffness"),  # Expected: [1800, 2000, 2200]
]
for label, column_name in factor_value_checks:
    print(label, df[column_name].unique())

Motor Speed values: [60 45 30]
Moisture values: [5 7 9]
Displacement Screw Setting values: [0.29 0.22 0.36]
Spring Stiffness values: [1800 2000 2200]


In [133]:
# Build the long-format table: one record per (run, crack column, video ID).
# Each record carries the run's factor values first, then the video ID and the
# name of the crack column in which that ID was listed.
records = [
    {
        **{factor: run[factor] for factor in factor_cols},
        "video_id": video,
        "crack_type": outcome_col,  # Original crack category name
    }
    for _, run in df.iterrows()
    for outcome_col in crack_cols
    for video in extract_video_ids(run[outcome_col])
]

df_long = pd.DataFrame(records)
print("Long-format DataFrame shape:", df_long.shape)
df_long.head(10)

Long-format DataFrame shape: (3769, 6)


Unnamed: 0,moisture,spring_stiffness,displacement_screw_setting,motor_speed,video_id,crack_type
0,5,1800,0.29,60,*GH013810,longitudinal_more_than_75%
1,5,1800,0.29,60,*GH013811,longitudinal_more_than_75%
2,5,1800,0.29,60,*GH013812,longitudinal_more_than_75%
3,5,1800,0.29,60,*GH013813,longitudinal_more_than_75%
4,5,1800,0.29,60,*GH013814,longitudinal_more_than_75%
5,5,1800,0.29,60,*GH013815,longitudinal_more_than_75%
6,5,1800,0.29,60,*GH013812,circumferential_between_25_50%
7,5,1800,0.29,60,*GH013813,circumferential_between_25_50%
8,5,1800,0.29,60,*GH013814,circumferential_between_25_50%
9,5,1800,0.29,60,*GH013810,circumferential_more_than_75%


In [None]:
# Pivot the long table into one row per (video, factor combination) with one
# column per crack type: 1 where the video was listed under that type, else 0.
def _mark_present(group):
    """Collapse any non-empty group of long-format rows to the indicator value 1."""
    return 1


pivot_index = ["video_id", *factor_cols]
df_aggregated = df_long.pivot_table(
    index=pivot_index,
    columns="crack_type",
    values="crack_type",
    aggfunc=_mark_present,
    fill_value=0,
).reset_index()

# Show the column names exactly as the pivot produced them
print("Raw aggregated columns:", df_aggregated.columns.tolist())

Raw aggregated columns: ['video_id', 'moisture', 'spring_stiffness', 'displacement_screw_setting', 'motor_speed', 'circumferential_between_25_50%', 'circumferential_between_50_75%', 'circumferential_less_than_25%', 'circumferential_more_than_75%', 'crushed', 'longitudinal_between_25_50%', 'longitudinal_between_50_75%', 'longitudinal_less_than_25%', 'longitudinal_more_than_75%', 'open_crack', 'untouched']


In [None]:
# Short names for the identifier/factor columns, keyed by their current names.
# Mapping by name (rather than overwriting the first five positions) means a
# change in the pivot's index order can never attach a label to the wrong column.
factor_rename = {
    "video_id": "video",
    "moisture": "moisture",
    "spring_stiffness": "spring",
    "displacement_screw_setting": "displacement",
    "motor_speed": "motor",
}
# Expected factor names, in output order:
expected_factor_cols = list(factor_rename.values())

# rename() ignores keys that are absent, so re-running this cell is harmless.
# rename_axis(None, ...) clears any label on the column axis, matching what the
# previous plain-list assignment to .columns did.
df_aggregated = df_aggregated.rename(columns=factor_rename).rename_axis(None, axis="columns")

# Fail loudly if an expected factor column is not there, instead of mislabelling data.
missing_factor_cols = [col for col in expected_factor_cols if col not in df_aggregated.columns]
if missing_factor_cols:
    raise KeyError(f"Factor columns missing after renaming: {missing_factor_cols}")
print("After reassigning factor names:")
print(df_aggregated.columns.tolist())

# Print remaining crack outcome columns (raw)
print("Raw crack outcome columns:", df_aggregated.columns[len(expected_factor_cols):])

# Define renaming dictionary for crack outcomes:
rename_dict = {
    "untouched": "U",
    "longitudinal_less_than_25%": "L_1",
    "longitudinal_between_25_50%": "L_2",
    "longitudinal_between_50_75%": "L_3",
    "longitudinal_more_than_75%": "L_4",
    "circumferential_less_than_25%": "C_1",
    "circumferential_between_25_50%": "C_2",
    "circumferential_between_50_75%": "C_3",
    "circumferential_more_than_75%": "C_4",
    "open_crack": "O",
    "crushed": "X",
}

# Rename crack outcome columns by name; columns without an entry keep their name.
df_aggregated = df_aggregated.rename(columns=rename_dict)
# Renamed crack outcome columns, i.e. every column that is not a factor column
new_crack_cols = [col for col in df_aggregated.columns if col not in expected_factor_cols]
print("Columns after renaming crack outcomes:")
print(df_aggregated.columns.tolist())

After reassigning factor names:
['video', 'moisture', 'spring', 'displacement', 'motor', 'circumferential_between_25_50%', 'circumferential_between_50_75%', 'circumferential_less_than_25%', 'circumferential_more_than_75%', 'crushed', 'longitudinal_between_25_50%', 'longitudinal_between_50_75%', 'longitudinal_less_than_25%', 'longitudinal_more_than_75%', 'open_crack', 'untouched']
Raw crack outcome columns: Index(['circumferential_between_25_50%', 'circumferential_between_50_75%',
       'circumferential_less_than_25%', 'circumferential_more_than_75%',
       'crushed', 'longitudinal_between_25_50%', 'longitudinal_between_50_75%',
       'longitudinal_less_than_25%', 'longitudinal_more_than_75%',
       'open_crack', 'untouched'],
      dtype='object')
Columns after renaming crack outcomes:
['video', 'moisture', 'spring', 'displacement', 'motor', 'C_2', 'C_3', 'C_1', 'C_4', 'X', 'L_2', 'L_3', 'L_1', 'L_4', 'O', 'U']


In [136]:
# Define desired order for crack outcome columns
desired_order = ["U", "L_1", "L_2", "L_3", "L_4", "C_1", "C_2", "C_3", "C_4", "O", "X"]
# Factor columns remain as defined
factor_order = expected_factor_cols
# Crack outcome columns are everything after the leading factor columns
current_crack_cols = df_aggregated.columns.tolist()[len(factor_order):]
# Reorder crack outcome columns based on desired order (only include those present)
new_crack_cols_ordered = [col for col in desired_order if col in current_crack_cols]
# Outcome columns that desired_order does not mention are kept at the end rather
# than silently dropped from the final dataset; report them so they get noticed.
unexpected_crack_cols = [col for col in current_crack_cols if col not in desired_order]
if unexpected_crack_cols:
    print("Warning: crack outcome columns not in desired_order were kept at the end:", unexpected_crack_cols)
# Combine factor columns with the ordered crack outcome columns
new_column_order = factor_order + new_crack_cols_ordered + unexpected_crack_cols
df_aggregated = df_aggregated[new_column_order]
print("Final aggregated binary column order:")
print(df_aggregated.columns.tolist())

Final aggregated binary column order:
['video', 'moisture', 'spring', 'displacement', 'motor', 'U', 'L_1', 'L_2', 'L_3', 'L_4', 'C_1', 'C_2', 'C_3', 'C_4', 'O', 'X']


In [137]:
# Write the renamed, reordered binary table to one CSV file (row index omitted)
output_csv_path = "meyer_aggregated_binary_renamed.csv"
df_aggregated.to_csv(output_csv_path, index=False)
print(f"Final renamed aggregated binary dataset saved as '{output_csv_path}'.")

Final renamed aggregated binary dataset saved as 'meyer_aggregated_binary_renamed.csv'.
