{
"cells": [
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"# Necessary libraries\n",
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Initial DataFrame shape: (243, 15)\n"
]
},
{
"data": {
"application/vnd.microsoft.datawrangler.viewer.v0+json": {
"columns": [
{
"name": "index",
"rawType": "int64",
"type": "integer"
},
{
"name": "moisture",
"rawType": "int64",
"type": "integer"
},
{
"name": "spring_stiffness ",
"rawType": "int64",
"type": "integer"
},
{
"name": "displacement_screw_setting",
"rawType": "float64",
"type": "float"
},
{
"name": "motor_speed",
"rawType": "int64",
"type": "integer"
},
{
"name": "untouched",
"rawType": "object",
"type": "unknown"
},
{
"name": "longitudinal less than 25%",
"rawType": "object",
"type": "unknown"
},
{
"name": "Longitudinal between 25-50%",
"rawType": "object",
"type": "unknown"
},
{
"name": "Longitudinal between 50-75%",
"rawType": "object",
"type": "unknown"
},
{
"name": "Longitudinal more than 75%",
"rawType": "object",
"type": "string"
},
{
"name": "Circumferential less than 25%",
"rawType": "object",
"type": "unknown"
},
{
"name": "Circumferential between 25-50%",
"rawType": "object",
"type": "unknown"
},
{
"name": "Circumferential between 50-75%",
"rawType": "object",
"type": "unknown"
},
{
"name": "Circumferential more than 75%",
"rawType": "object",
"type": "string"
},
{
"name": "Open Crack",
"rawType": "object",
"type": "string"
},
{
"name": "Crushed",
"rawType": "object",
"type": "unknown"
}
],
"conversionMethod": "pd.DataFrame",
"ref": "a9104025-3514-4dca-bdc7-4745f489815b",
"rows": [
[
"0",
"5",
"1800",
"0.29",
"60",
null,
null,
null,
null,
"*GH013810, *GH013811, *GH013812, *GH013813, *GH013814, *GH013815",
null,
"*GH013812, *GH013813, *GH013814",
null,
"*GH013810, *GH013811, *GH013815",
"*GH013810, *GH013811, *GH013812, *GH013813, *GH013814, *GH013815",
null
],
[
"1",
"5",
"1800",
"0.22",
"45",
null,
null,
null,
null,
"*GH013816, *GH013817, *GH013818, *GH013819, *GH013820, *GH013821",
null,
"*GH013818",
"*GH013821",
"*GH013816, *GH013817, *GH013819, *GH013820",
"*GH013816, *GH013817, *GH013818, *GH013819, *GH013820, *GH013821",
null
],
[
"2",
"5",
"1800",
"0.36",
"30",
"*GH013822",
null,
null,
"*GH013823",
"*GH013824, *GH013825, *GH013826, *GH013827",
"*GH013823, *GH013826",
null,
"*GH013824, *GH013827",
"*GH013825",
"*GH013823, *GH013824, *GH013825, *GH013827",
null
],
[
"3",
"5",
"1800",
"0.36",
"60",
"*GH013832",
null,
null,
null,
"*GH013828, *GH013829, *GH013830, *GH013831, *GH013833",
"*GH013829",
"*GH013830, *GH013833",
"*GH013828",
"*GH013831",
"*GH013828, *GH013830, *GH013831, *GH013833",
null
],
[
"4",
"5",
"1800",
"0.22",
"30",
null,
null,
null,
null,
"*GH013834, *GH013835, *GH013836, *GH013837, *GH013838, *GH013839",
null,
null,
"*GH013836",
"*GH013834, *GH013835, *GH013837, *GH013838, *GH013839",
"*GH013834, *GH013835, *GH013836, *GH013837, *GH013838, *GH013839",
null
]
],
"shape": {
"columns": 15,
"rows": 5
}
},
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" moisture | \n",
" spring_stiffness | \n",
" displacement_screw_setting | \n",
" motor_speed | \n",
" untouched | \n",
" longitudinal less than 25% | \n",
" Longitudinal between 25-50% | \n",
" Longitudinal between 50-75% | \n",
" Longitudinal more than 75% | \n",
" Circumferential less than 25% | \n",
" Circumferential between 25-50% | \n",
" Circumferential between 50-75% | \n",
" Circumferential more than 75% | \n",
" Open Crack | \n",
" Crushed | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 5 | \n",
" 1800 | \n",
" 0.29 | \n",
" 60 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" *GH013810, *GH013811, *GH013812, *GH013813, *G... | \n",
" NaN | \n",
" *GH013812, *GH013813, *GH013814 | \n",
" NaN | \n",
" *GH013810, *GH013811, *GH013815 | \n",
" *GH013810, *GH013811, *GH013812, *GH013813, *G... | \n",
" NaN | \n",
"
\n",
" \n",
" 1 | \n",
" 5 | \n",
" 1800 | \n",
" 0.22 | \n",
" 45 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" *GH013816, *GH013817, *GH013818, *GH013819, *G... | \n",
" NaN | \n",
" *GH013818 | \n",
" *GH013821 | \n",
" *GH013816, *GH013817, *GH013819, *GH013820 | \n",
" *GH013816, *GH013817, *GH013818, *GH013819, *G... | \n",
" NaN | \n",
"
\n",
" \n",
" 2 | \n",
" 5 | \n",
" 1800 | \n",
" 0.36 | \n",
" 30 | \n",
" *GH013822 | \n",
" NaN | \n",
" NaN | \n",
" *GH013823 | \n",
" *GH013824, *GH013825, *GH013826, *GH013827 | \n",
" *GH013823, *GH013826 | \n",
" NaN | \n",
" *GH013824, *GH013827 | \n",
" *GH013825 | \n",
" *GH013823, *GH013824, *GH013825, *GH013827 | \n",
" NaN | \n",
"
\n",
" \n",
" 3 | \n",
" 5 | \n",
" 1800 | \n",
" 0.36 | \n",
" 60 | \n",
" *GH013832 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" *GH013828, *GH013829, *GH013830, *GH013831, *G... | \n",
" *GH013829 | \n",
" *GH013830, *GH013833 | \n",
" *GH013828 | \n",
" *GH013831 | \n",
" *GH013828, *GH013830, *GH013831, *GH013833 | \n",
" NaN | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" 1800 | \n",
" 0.22 | \n",
" 30 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" *GH013834, *GH013835, *GH013836, *GH013837, *G... | \n",
" NaN | \n",
" NaN | \n",
" *GH013836 | \n",
" *GH013834, *GH013835, *GH013837, *GH013838, *G... | \n",
" *GH013834, *GH013835, *GH013836, *GH013837, *G... | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" moisture spring_stiffness displacement_screw_setting motor_speed \\\n",
"0 5 1800 0.29 60 \n",
"1 5 1800 0.22 45 \n",
"2 5 1800 0.36 30 \n",
"3 5 1800 0.36 60 \n",
"4 5 1800 0.22 30 \n",
"\n",
" untouched longitudinal less than 25% Longitudinal between 25-50% \\\n",
"0 NaN NaN NaN \n",
"1 NaN NaN NaN \n",
"2 *GH013822 NaN NaN \n",
"3 *GH013832 NaN NaN \n",
"4 NaN NaN NaN \n",
"\n",
" Longitudinal between 50-75% \\\n",
"0 NaN \n",
"1 NaN \n",
"2 *GH013823 \n",
"3 NaN \n",
"4 NaN \n",
"\n",
" Longitudinal more than 75% \\\n",
"0 *GH013810, *GH013811, *GH013812, *GH013813, *G... \n",
"1 *GH013816, *GH013817, *GH013818, *GH013819, *G... \n",
"2 *GH013824, *GH013825, *GH013826, *GH013827 \n",
"3 *GH013828, *GH013829, *GH013830, *GH013831, *G... \n",
"4 *GH013834, *GH013835, *GH013836, *GH013837, *G... \n",
"\n",
" Circumferential less than 25% Circumferential between 25-50% \\\n",
"0 NaN *GH013812, *GH013813, *GH013814 \n",
"1 NaN *GH013818 \n",
"2 *GH013823, *GH013826 NaN \n",
"3 *GH013829 *GH013830, *GH013833 \n",
"4 NaN NaN \n",
"\n",
" Circumferential between 50-75% \\\n",
"0 NaN \n",
"1 *GH013821 \n",
"2 *GH013824, *GH013827 \n",
"3 *GH013828 \n",
"4 *GH013836 \n",
"\n",
" Circumferential more than 75% \\\n",
"0 *GH013810, *GH013811, *GH013815 \n",
"1 *GH013816, *GH013817, *GH013819, *GH013820 \n",
"2 *GH013825 \n",
"3 *GH013831 \n",
"4 *GH013834, *GH013835, *GH013837, *GH013838, *G... \n",
"\n",
" Open Crack Crushed \n",
"0 *GH013810, *GH013811, *GH013812, *GH013813, *G... NaN \n",
"1 *GH013816, *GH013817, *GH013818, *GH013819, *G... NaN \n",
"2 *GH013823, *GH013824, *GH013825, *GH013827 NaN \n",
"3 *GH013828, *GH013830, *GH013831, *GH013833 NaN \n",
"4 *GH013834, *GH013835, *GH013836, *GH013837, *G... NaN "
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the dataset\n",
"df = pd.read_csv(\"meyer.csv\")\n",
"\n",
"# Print an initial summary\n",
"print(\"Initial DataFrame shape:\", df.shape)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Renamed columns: ['moisture', 'spring_stiffness', 'displacement_screw_setting', 'motor_speed', 'untouched', 'longitudinal_less_than_25%', 'longitudinal_between_25_50%', 'longitudinal_between_50_75%', 'longitudinal_more_than_75%', 'circumferential_less_than_25%', 'circumferential_between_25_50%', 'circumferential_between_50_75%', 'circumferential_more_than_75%', 'open_crack', 'crushed']\n"
]
}
],
"source": [
"# Inspect columns and rename them for clarity\n",
"df.columns = df.columns.str.lower().str.strip().str.replace(\" \", \"_\").str.replace(\"-\", \"_\")\n",
"\n",
"print(\"Renamed columns:\", df.columns.tolist())"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Any missing values in factor columns?\n",
" moisture 0\n",
"spring_stiffness 0\n",
"displacement_screw_setting 0\n",
"motor_speed 0\n",
"dtype: int64\n",
"Any missing values in factor columns?\n",
" moisture 0\n",
"spring_stiffness 0\n",
"displacement_screw_setting 0\n",
"motor_speed 0\n",
"dtype: int64\n",
"Any missing values in factor columns?\n",
" moisture 0\n",
"spring_stiffness 0\n",
"displacement_screw_setting 0\n",
"motor_speed 0\n",
"dtype: int64\n",
"Any missing values in factor columns?\n",
" moisture 0\n",
"spring_stiffness 0\n",
"displacement_screw_setting 0\n",
"motor_speed 0\n",
"dtype: int64\n"
]
}
],
"source": [
"# Verify factor columns and parse them into correct data types\n",
"factor_cols = [\"moisture\", \"spring_stiffness\", \"displacement_screw_setting\", \"motor_speed\"]\n",
"for col in factor_cols:\n",
" df[col] = pd.to_numeric(df[col], errors=\"coerce\")\n",
" print(\"Any missing values in factor columns?\\n\", df[factor_cols].isnull().sum())"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Crack columns identified: ['untouched', 'longitudinal_less_than_25%', 'longitudinal_between_25_50%', 'longitudinal_between_50_75%', 'longitudinal_more_than_75%', 'circumferential_less_than_25%', 'circumferential_between_25_50%', 'circumferential_between_50_75%', 'circumferential_more_than_75%', 'open_crack', 'crushed']\n"
]
}
],
"source": [
"# Identify the crack outcome columns\n",
"crack_cols = [c for c in df.columns if c not in factor_cols]\n",
"print(\"Crack columns identified:\", crack_cols)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"# Count total video references per row\n",
"def extract_video_ids(cell_value):\n",
" \"\"\"\n",
" cell_value is a string with video references (like '*GH013810, *GH013811')\n",
" We'll split by comma, strip spaces, and return a list of IDs\n",
" If cell_value is NaN or empty, return empty list\n",
" \"\"\"\n",
" if pd.isna(cell_value):\n",
" return []\n",
" # Split on comma\n",
" items = cell_value.split(\",\")\n",
" # Clean up whitespace\n",
" items = [x.strip() for x in items if x.strip() != \"\"]\n",
" return items\n",
"\n",
"\n",
"# We'll accumulate all IDs across the crack columns for each row\n",
"def count_unique_videos(row):\n",
" all_ids = []\n",
" for col in crack_cols:\n",
" # row[col] might be a string with multiple video references\n",
" ids = extract_video_ids(row[col])\n",
" all_ids.extend(ids)\n",
" return len(set(all_ids))\n",
"\n",
"\n",
"df[\"video_count\"] = df.apply(count_unique_videos, axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" video_count missing_videos_flag\n",
"0 6 False\n",
"1 6 False\n",
"2 6 False\n",
"3 6 False\n",
"4 6 False\n",
"5 5 True\n",
"6 6 False\n",
"7 6 False\n",
"8 6 False\n",
"9 6 False\n"
]
}
],
"source": [
"# Flag rows with fewer than 6 videos as missing\n",
"df[\"missing_videos_flag\"] = df[\"video_count\"] < 6\n",
"print(df[[\"video_count\", \"missing_videos_flag\"]].head(10))"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of runs with fewer than 6 videos: 16\n"
]
}
],
"source": [
"# Keep those rows for now with a note\n",
"missing_count = df[\"missing_videos_flag\"].sum()\n",
"print(f\"Number of runs with fewer than 6 videos: {missing_count}\")"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"# Data checks for motor_speed range\n",
"valid_speeds = [30, 45, 60]\n",
"mask_invalid_speed = ~df[\"motor_speed\"].isin(valid_speeds)\n",
"if mask_invalid_speed.any():\n",
" print(\"Invalid motor_speed values found:\")\n",
" print(df.loc[mask_invalid_speed, \"motor_speed\"].unique())"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"df_long shape: (3769, 6)\n"
]
},
{
"data": {
"application/vnd.microsoft.datawrangler.viewer.v0+json": {
"columns": [
{
"name": "index",
"rawType": "int64",
"type": "integer"
},
{
"name": "moisture",
"rawType": "int64",
"type": "integer"
},
{
"name": "spring_stiffness",
"rawType": "int64",
"type": "integer"
},
{
"name": "displacement_screw_setting",
"rawType": "float64",
"type": "float"
},
{
"name": "motor_speed",
"rawType": "int64",
"type": "integer"
},
{
"name": "video_id",
"rawType": "object",
"type": "string"
},
{
"name": "crack_type",
"rawType": "object",
"type": "string"
}
],
"conversionMethod": "pd.DataFrame",
"ref": "c2bb5139-cf11-4a42-8dfc-98a822b57b6c",
"rows": [
[
"0",
"5",
"1800",
"0.29",
"60",
"*GH013810",
"longitudinal_more_than_75%"
],
[
"1",
"5",
"1800",
"0.29",
"60",
"*GH013811",
"longitudinal_more_than_75%"
],
[
"2",
"5",
"1800",
"0.29",
"60",
"*GH013812",
"longitudinal_more_than_75%"
],
[
"3",
"5",
"1800",
"0.29",
"60",
"*GH013813",
"longitudinal_more_than_75%"
],
[
"4",
"5",
"1800",
"0.29",
"60",
"*GH013814",
"longitudinal_more_than_75%"
],
[
"5",
"5",
"1800",
"0.29",
"60",
"*GH013815",
"longitudinal_more_than_75%"
],
[
"6",
"5",
"1800",
"0.29",
"60",
"*GH013812",
"circumferential_between_25_50%"
],
[
"7",
"5",
"1800",
"0.29",
"60",
"*GH013813",
"circumferential_between_25_50%"
],
[
"8",
"5",
"1800",
"0.29",
"60",
"*GH013814",
"circumferential_between_25_50%"
],
[
"9",
"5",
"1800",
"0.29",
"60",
"*GH013810",
"circumferential_more_than_75%"
]
],
"shape": {
"columns": 6,
"rows": 10
}
},
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" moisture | \n",
" spring_stiffness | \n",
" displacement_screw_setting | \n",
" motor_speed | \n",
" video_id | \n",
" crack_type | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 5 | \n",
" 1800 | \n",
" 0.29 | \n",
" 60 | \n",
" *GH013810 | \n",
" longitudinal_more_than_75% | \n",
"
\n",
" \n",
" 1 | \n",
" 5 | \n",
" 1800 | \n",
" 0.29 | \n",
" 60 | \n",
" *GH013811 | \n",
" longitudinal_more_than_75% | \n",
"
\n",
" \n",
" 2 | \n",
" 5 | \n",
" 1800 | \n",
" 0.29 | \n",
" 60 | \n",
" *GH013812 | \n",
" longitudinal_more_than_75% | \n",
"
\n",
" \n",
" 3 | \n",
" 5 | \n",
" 1800 | \n",
" 0.29 | \n",
" 60 | \n",
" *GH013813 | \n",
" longitudinal_more_than_75% | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" 1800 | \n",
" 0.29 | \n",
" 60 | \n",
" *GH013814 | \n",
" longitudinal_more_than_75% | \n",
"
\n",
" \n",
" 5 | \n",
" 5 | \n",
" 1800 | \n",
" 0.29 | \n",
" 60 | \n",
" *GH013815 | \n",
" longitudinal_more_than_75% | \n",
"
\n",
" \n",
" 6 | \n",
" 5 | \n",
" 1800 | \n",
" 0.29 | \n",
" 60 | \n",
" *GH013812 | \n",
" circumferential_between_25_50% | \n",
"
\n",
" \n",
" 7 | \n",
" 5 | \n",
" 1800 | \n",
" 0.29 | \n",
" 60 | \n",
" *GH013813 | \n",
" circumferential_between_25_50% | \n",
"
\n",
" \n",
" 8 | \n",
" 5 | \n",
" 1800 | \n",
" 0.29 | \n",
" 60 | \n",
" *GH013814 | \n",
" circumferential_between_25_50% | \n",
"
\n",
" \n",
" 9 | \n",
" 5 | \n",
" 1800 | \n",
" 0.29 | \n",
" 60 | \n",
" *GH013810 | \n",
" circumferential_more_than_75% | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" moisture spring_stiffness displacement_screw_setting motor_speed \\\n",
"0 5 1800 0.29 60 \n",
"1 5 1800 0.29 60 \n",
"2 5 1800 0.29 60 \n",
"3 5 1800 0.29 60 \n",
"4 5 1800 0.29 60 \n",
"5 5 1800 0.29 60 \n",
"6 5 1800 0.29 60 \n",
"7 5 1800 0.29 60 \n",
"8 5 1800 0.29 60 \n",
"9 5 1800 0.29 60 \n",
"\n",
" video_id crack_type \n",
"0 *GH013810 longitudinal_more_than_75% \n",
"1 *GH013811 longitudinal_more_than_75% \n",
"2 *GH013812 longitudinal_more_than_75% \n",
"3 *GH013813 longitudinal_more_than_75% \n",
"4 *GH013814 longitudinal_more_than_75% \n",
"5 *GH013815 longitudinal_more_than_75% \n",
"6 *GH013812 circumferential_between_25_50% \n",
"7 *GH013813 circumferential_between_25_50% \n",
"8 *GH013814 circumferential_between_25_50% \n",
"9 *GH013810 circumferential_more_than_75% "
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Reshape data to a \"long\" format\n",
"all_records = []\n",
"for idx, row in df.iterrows():\n",
" # get factor values\n",
" row_factors = {\"moisture\": row[\"moisture\"], \"spring_stiffness\": row[\"spring_stiffness\"], \"displacement_screw_setting\": row[\"displacement_screw_setting\"], \"motor_speed\": row[\"motor_speed\"]}\n",
" # gather crack data\n",
" for col in crack_cols:\n",
" video_ids = extract_video_ids(row[col])\n",
" # each video ID is a single pecan with a crack classification col\n",
" # col is the classification type\n",
" for vid in video_ids:\n",
" # Build a record\n",
" record = row_factors.copy()\n",
" record[\"video_id\"] = vid\n",
" record[\"crack_type\"] = col # the name of the classification\n",
" all_records.append(record)\n",
"\n",
"df_long = pd.DataFrame(all_records)\n",
"\n",
"print(\"df_long shape:\", df_long.shape)\n",
"df_long.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Data cleaning complete. Cleaned files saved.\n"
]
}
],
"source": [
"# Save the reshaped data\n",
"df.to_csv(\"meyer_cleaned_wide.csv\", index=False)\n",
"df_long.to_csv(\"meyer_cleaned_long.csv\", index=False)\n",
"\n",
"print(\"Data cleaning complete. Cleaned files saved.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "pecan",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}