In [20]:
# Necessary libraries
import pandas as pd
import numpy as np

In [21]:
# Read the raw table from disk
df = pd.read_csv("meyer.csv")

# Quick sanity check: report the dimensions, then show the first five rows
initial_shape = df.shape
print("Initial DataFrame shape:", initial_shape)
df.head(5)

Initial DataFrame shape: (243, 15)


Unnamed: 0,moisture,spring_stiffness,displacement_screw_setting,motor_speed,untouched,longitudinal less than 25%,Longitudinal between 25-50%,Longitudinal between 50-75%,Longitudinal more than 75%,Circumferential less than 25%,Circumferential between 25-50%,Circumferential between 50-75%,Circumferential more than 75%,Open Crack,Crushed
0,5,1800,0.29,60,,,,,"*GH013810, *GH013811, *GH013812, *GH013813, *G...",,"*GH013812, *GH013813, *GH013814",,"*GH013810, *GH013811, *GH013815","*GH013810, *GH013811, *GH013812, *GH013813, *G...",
1,5,1800,0.22,45,,,,,"*GH013816, *GH013817, *GH013818, *GH013819, *G...",,*GH013818,*GH013821,"*GH013816, *GH013817, *GH013819, *GH013820","*GH013816, *GH013817, *GH013818, *GH013819, *G...",
2,5,1800,0.36,30,*GH013822,,,*GH013823,"*GH013824, *GH013825, *GH013826, *GH013827","*GH013823, *GH013826",,"*GH013824, *GH013827",*GH013825,"*GH013823, *GH013824, *GH013825, *GH013827",
3,5,1800,0.36,60,*GH013832,,,,"*GH013828, *GH013829, *GH013830, *GH013831, *G...",*GH013829,"*GH013830, *GH013833",*GH013828,*GH013831,"*GH013828, *GH013830, *GH013831, *GH013833",
4,5,1800,0.22,30,,,,,"*GH013834, *GH013835, *GH013836, *GH013837, *G...",,,*GH013836,"*GH013834, *GH013835, *GH013837, *GH013838, *G...","*GH013834, *GH013835, *GH013836, *GH013837, *G...",


In [22]:
# Normalise column names: lower-case, trimmed, with spaces and hyphens
# turned into underscores (other characters such as "%" are left alone)
cleaned_names = df.columns.str.lower().str.strip()
for separator in (" ", "-"):
    cleaned_names = cleaned_names.str.replace(separator, "_")
df.columns = cleaned_names

print("Renamed columns:", df.columns.tolist())

Renamed columns: ['moisture', 'spring_stiffness', 'displacement_screw_setting', 'motor_speed', 'untouched', 'longitudinal_less_than_25%', 'longitudinal_between_25_50%', 'longitudinal_between_50_75%', 'longitudinal_more_than_75%', 'circumferential_less_than_25%', 'circumferential_between_25_50%', 'circumferential_between_50_75%', 'circumferential_more_than_75%', 'open_crack', 'crushed']


In [23]:
# Verify factor columns and parse them into correct data types
factor_cols = ["moisture", "spring_stiffness", "displacement_screw_setting", "motor_speed"]
for col in factor_cols:
    # errors="coerce" turns unparseable entries into NaN, so the null count
    # reported below also reveals malformed factor values
    df[col] = pd.to_numeric(df[col], errors="coerce")

# Report once, after every factor column has been converted. Printing inside
# the loop repeated the summary per column and showed it before all columns
# had been processed.
print("Any missing values in factor columns?\n", df[factor_cols].isnull().sum())

Any missing values in factor columns?
 moisture                      0
spring_stiffness              0
displacement_screw_setting    0
motor_speed                   0
dtype: int64
Any missing values in factor columns?
 moisture                      0
spring_stiffness              0
displacement_screw_setting    0
motor_speed                   0
dtype: int64
Any missing values in factor columns?
 moisture                      0
spring_stiffness              0
displacement_screw_setting    0
motor_speed                   0
dtype: int64
Any missing values in factor columns?
 moisture                      0
spring_stiffness              0
displacement_screw_setting    0
motor_speed                   0
dtype: int64


In [24]:
# Every column that is not one of the factor columns is treated as a
# crack outcome column; original column order is kept
factor_set = set(factor_cols)
crack_cols = [name for name in df.columns if name not in factor_set]
print("Crack columns identified:", crack_cols)

Crack columns identified: ['untouched', 'longitudinal_less_than_25%', 'longitudinal_between_25_50%', 'longitudinal_between_50_75%', 'longitudinal_more_than_75%', 'circumferential_less_than_25%', 'circumferential_between_25_50%', 'circumferential_between_50_75%', 'circumferential_more_than_75%', 'open_crack', 'crushed']


In [25]:
# Count total video references per row
def extract_video_ids(cell_value):
    """
    Turn one cell of video references into a list of IDs.

    cell_value is a comma-separated string (like '*GH013810, *GH013811').
    Each piece is stripped of surrounding whitespace and blank pieces are
    discarded. A missing cell (NaN/None) yields an empty list.
    """
    if pd.isna(cell_value):
        return []
    video_ids = []
    for fragment in cell_value.split(","):
        trimmed = fragment.strip()
        # Skip blanks left by stray or trailing commas
        if trimmed:
            video_ids.append(trimmed)
    return video_ids


# We'll accumulate all IDs across the crack columns for each row
def count_unique_videos(row):
    """
    Return the number of distinct video IDs found across the crack columns
    of one row. An ID listed under several crack columns is counted once.
    """
    seen_ids = set()
    for crack_col in crack_cols:
        # Each cell may hold several comma-separated references
        seen_ids.update(extract_video_ids(row[crack_col]))
    return len(seen_ids)


# Row-wise pass: one distinct-video count per row of the wide table
df["video_count"] = df.apply(count_unique_videos, axis="columns")

In [26]:
# Rows whose distinct-video count is below 6 are flagged as missing videos
df["missing_videos_flag"] = df["video_count"].lt(6)
print(df[["video_count", "missing_videos_flag"]].head(10))

   video_count  missing_videos_flag
0            6                False
1            6                False
2            6                False
3            6                False
4            6                False
5            5                 True
6            6                False
7            6                False
8            6                False
9            6                False


In [27]:
# Nothing is dropped here: flagged rows stay in df and are only tallied
# (summing a boolean column counts its True entries)
missing_count = df.loc[:, "missing_videos_flag"].sum()
print(f"Number of runs with fewer than 6 videos: {missing_count}")

Number of runs with fewer than 6 videos: 16


In [28]:
# Data check: motor_speed must be one of the listed settings
valid_speeds = [30, 45, 60]
speed_is_valid = df["motor_speed"].isin(valid_speeds)
mask_invalid_speed = ~speed_is_valid
# Only report when at least one row falls outside the allowed settings
if not speed_is_valid.all():
    invalid_values = df.loc[mask_invalid_speed, "motor_speed"].unique()
    print("Invalid motor_speed values found:")
    print(invalid_values)

In [29]:
# Reshape data to a "long" format: one record per (row, crack column, video ID)
factor_names = ("moisture", "spring_stiffness", "displacement_screw_setting", "motor_speed")
all_records = [
    # factor values first, then the video ID and the crack column it was listed under
    {**{name: row[name] for name in factor_names}, "video_id": vid, "crack_type": col}
    for _, row in df.iterrows()
    for col in crack_cols
    for vid in extract_video_ids(row[col])
]

df_long = pd.DataFrame(all_records)

print("df_long shape:", df_long.shape)
df_long.head(10)

df_long shape: (3769, 6)


Unnamed: 0,moisture,spring_stiffness,displacement_screw_setting,motor_speed,video_id,crack_type
0,5,1800,0.29,60,*GH013810,longitudinal_more_than_75%
1,5,1800,0.29,60,*GH013811,longitudinal_more_than_75%
2,5,1800,0.29,60,*GH013812,longitudinal_more_than_75%
3,5,1800,0.29,60,*GH013813,longitudinal_more_than_75%
4,5,1800,0.29,60,*GH013814,longitudinal_more_than_75%
5,5,1800,0.29,60,*GH013815,longitudinal_more_than_75%
6,5,1800,0.29,60,*GH013812,circumferential_between_25_50%
7,5,1800,0.29,60,*GH013813,circumferential_between_25_50%
8,5,1800,0.29,60,*GH013814,circumferential_between_25_50%
9,5,1800,0.29,60,*GH013810,circumferential_more_than_75%


In [30]:
# Write both tables to CSV without the index column: the wide frame
# (including the added count/flag columns) and the long frame
for frame, path in ((df, "meyer_cleaned_wide.csv"), (df_long, "meyer_cleaned_long.csv")):
    frame.to_csv(path, index=False)

print("Data cleaning complete. Cleaned files saved.")

Data cleaning complete. Cleaned files saved.
