1021 lines
33 KiB
Plaintext
1021 lines
33 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Initial DataFrame shape: (243, 15)\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"application/vnd.microsoft.datawrangler.viewer.v0+json": {
|
|
"columns": [
|
|
{
|
|
"name": "index",
|
|
"rawType": "int64",
|
|
"type": "integer"
|
|
},
|
|
{
|
|
"name": "moisture",
|
|
"rawType": "int64",
|
|
"type": "integer"
|
|
},
|
|
{
|
|
"name": "spring_stiffness ",
|
|
"rawType": "int64",
|
|
"type": "integer"
|
|
},
|
|
{
|
|
"name": "displacement_screw_setting",
|
|
"rawType": "float64",
|
|
"type": "float"
|
|
},
|
|
{
|
|
"name": "motor_speed",
|
|
"rawType": "int64",
|
|
"type": "integer"
|
|
},
|
|
{
|
|
"name": "untouched",
|
|
"rawType": "object",
|
|
"type": "unknown"
|
|
},
|
|
{
|
|
"name": "longitudinal less than 25%",
|
|
"rawType": "object",
|
|
"type": "unknown"
|
|
},
|
|
{
|
|
"name": "Longitudinal between 25-50%",
|
|
"rawType": "object",
|
|
"type": "unknown"
|
|
},
|
|
{
|
|
"name": "Longitudinal between 50-75%",
|
|
"rawType": "object",
|
|
"type": "unknown"
|
|
},
|
|
{
|
|
"name": "Longitudinal more than 75%",
|
|
"rawType": "object",
|
|
"type": "string"
|
|
},
|
|
{
|
|
"name": "Circumferential less than 25%",
|
|
"rawType": "object",
|
|
"type": "unknown"
|
|
},
|
|
{
|
|
"name": "Circumferential between 25-50%",
|
|
"rawType": "object",
|
|
"type": "unknown"
|
|
},
|
|
{
|
|
"name": "Circumferential between 50-75%",
|
|
"rawType": "object",
|
|
"type": "unknown"
|
|
},
|
|
{
|
|
"name": "Circumferential more than 75%",
|
|
"rawType": "object",
|
|
"type": "string"
|
|
},
|
|
{
|
|
"name": "Open Crack",
|
|
"rawType": "object",
|
|
"type": "string"
|
|
},
|
|
{
|
|
"name": "Crushed",
|
|
"rawType": "object",
|
|
"type": "unknown"
|
|
}
|
|
],
|
|
"conversionMethod": "pd.DataFrame",
|
|
"ref": "12a85fd7-7b36-4ff2-bae9-05e2193cf380",
|
|
"rows": [
|
|
[
|
|
"0",
|
|
"5",
|
|
"1800",
|
|
"0.29",
|
|
"60",
|
|
null,
|
|
null,
|
|
null,
|
|
null,
|
|
"*GH013810, *GH013811, *GH013812, *GH013813, *GH013814, *GH013815",
|
|
null,
|
|
"*GH013812, *GH013813, *GH013814",
|
|
null,
|
|
"*GH013810, *GH013811, *GH013815",
|
|
"*GH013810, *GH013811, *GH013812, *GH013813, *GH013814, *GH013815",
|
|
null
|
|
],
|
|
[
|
|
"1",
|
|
"5",
|
|
"1800",
|
|
"0.22",
|
|
"45",
|
|
null,
|
|
null,
|
|
null,
|
|
null,
|
|
"*GH013816, *GH013817, *GH013818, *GH013819, *GH013820, *GH013821",
|
|
null,
|
|
"*GH013818",
|
|
"*GH013821",
|
|
"*GH013816, *GH013817, *GH013819, *GH013820",
|
|
"*GH013816, *GH013817, *GH013818, *GH013819, *GH013820, *GH013821",
|
|
null
|
|
],
|
|
[
|
|
"2",
|
|
"5",
|
|
"1800",
|
|
"0.36",
|
|
"30",
|
|
"*GH013822",
|
|
null,
|
|
null,
|
|
"*GH013823",
|
|
"*GH013824, *GH013825, *GH013826, *GH013827",
|
|
"*GH013823, *GH013826",
|
|
null,
|
|
"*GH013824, *GH013827",
|
|
"*GH013825",
|
|
"*GH013823, *GH013824, *GH013825, *GH013827",
|
|
null
|
|
],
|
|
[
|
|
"3",
|
|
"5",
|
|
"1800",
|
|
"0.36",
|
|
"60",
|
|
"*GH013832",
|
|
null,
|
|
null,
|
|
null,
|
|
"*GH013828, *GH013829, *GH013830, *GH013831, *GH013833",
|
|
"*GH013829",
|
|
"*GH013830, *GH013833",
|
|
"*GH013828",
|
|
"*GH013831",
|
|
"*GH013828, *GH013830, *GH013831, *GH013833",
|
|
null
|
|
],
|
|
[
|
|
"4",
|
|
"5",
|
|
"1800",
|
|
"0.22",
|
|
"30",
|
|
null,
|
|
null,
|
|
null,
|
|
null,
|
|
"*GH013834, *GH013835, *GH013836, *GH013837, *GH013838, *GH013839",
|
|
null,
|
|
null,
|
|
"*GH013836",
|
|
"*GH013834, *GH013835, *GH013837, *GH013838, *GH013839",
|
|
"*GH013834, *GH013835, *GH013836, *GH013837, *GH013838, *GH013839",
|
|
null
|
|
]
|
|
],
|
|
"shape": {
|
|
"columns": 15,
|
|
"rows": 5
|
|
}
|
|
},
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>moisture</th>\n",
|
|
" <th>spring_stiffness</th>\n",
|
|
" <th>displacement_screw_setting</th>\n",
|
|
" <th>motor_speed</th>\n",
|
|
" <th>untouched</th>\n",
|
|
" <th>longitudinal less than 25%</th>\n",
|
|
" <th>Longitudinal between 25-50%</th>\n",
|
|
" <th>Longitudinal between 50-75%</th>\n",
|
|
" <th>Longitudinal more than 75%</th>\n",
|
|
" <th>Circumferential less than 25%</th>\n",
|
|
" <th>Circumferential between 25-50%</th>\n",
|
|
" <th>Circumferential between 50-75%</th>\n",
|
|
" <th>Circumferential more than 75%</th>\n",
|
|
" <th>Open Crack</th>\n",
|
|
" <th>Crushed</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>5</td>\n",
|
|
" <td>1800</td>\n",
|
|
" <td>0.29</td>\n",
|
|
" <td>60</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>*GH013810, *GH013811, *GH013812, *GH013813, *G...</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>*GH013812, *GH013813, *GH013814</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>*GH013810, *GH013811, *GH013815</td>\n",
|
|
" <td>*GH013810, *GH013811, *GH013812, *GH013813, *G...</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>5</td>\n",
|
|
" <td>1800</td>\n",
|
|
" <td>0.22</td>\n",
|
|
" <td>45</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>*GH013816, *GH013817, *GH013818, *GH013819, *G...</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>*GH013818</td>\n",
|
|
" <td>*GH013821</td>\n",
|
|
" <td>*GH013816, *GH013817, *GH013819, *GH013820</td>\n",
|
|
" <td>*GH013816, *GH013817, *GH013818, *GH013819, *G...</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>5</td>\n",
|
|
" <td>1800</td>\n",
|
|
" <td>0.36</td>\n",
|
|
" <td>30</td>\n",
|
|
" <td>*GH013822</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>*GH013823</td>\n",
|
|
" <td>*GH013824, *GH013825, *GH013826, *GH013827</td>\n",
|
|
" <td>*GH013823, *GH013826</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>*GH013824, *GH013827</td>\n",
|
|
" <td>*GH013825</td>\n",
|
|
" <td>*GH013823, *GH013824, *GH013825, *GH013827</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>5</td>\n",
|
|
" <td>1800</td>\n",
|
|
" <td>0.36</td>\n",
|
|
" <td>60</td>\n",
|
|
" <td>*GH013832</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>*GH013828, *GH013829, *GH013830, *GH013831, *G...</td>\n",
|
|
" <td>*GH013829</td>\n",
|
|
" <td>*GH013830, *GH013833</td>\n",
|
|
" <td>*GH013828</td>\n",
|
|
" <td>*GH013831</td>\n",
|
|
" <td>*GH013828, *GH013830, *GH013831, *GH013833</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>5</td>\n",
|
|
" <td>1800</td>\n",
|
|
" <td>0.22</td>\n",
|
|
" <td>30</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>*GH013834, *GH013835, *GH013836, *GH013837, *G...</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>*GH013836</td>\n",
|
|
" <td>*GH013834, *GH013835, *GH013837, *GH013838, *G...</td>\n",
|
|
" <td>*GH013834, *GH013835, *GH013836, *GH013837, *G...</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" moisture spring_stiffness displacement_screw_setting motor_speed \\\n",
|
|
"0 5 1800 0.29 60 \n",
|
|
"1 5 1800 0.22 45 \n",
|
|
"2 5 1800 0.36 30 \n",
|
|
"3 5 1800 0.36 60 \n",
|
|
"4 5 1800 0.22 30 \n",
|
|
"\n",
|
|
" untouched longitudinal less than 25% Longitudinal between 25-50% \\\n",
|
|
"0 NaN NaN NaN \n",
|
|
"1 NaN NaN NaN \n",
|
|
"2 *GH013822 NaN NaN \n",
|
|
"3 *GH013832 NaN NaN \n",
|
|
"4 NaN NaN NaN \n",
|
|
"\n",
|
|
" Longitudinal between 50-75% \\\n",
|
|
"0 NaN \n",
|
|
"1 NaN \n",
|
|
"2 *GH013823 \n",
|
|
"3 NaN \n",
|
|
"4 NaN \n",
|
|
"\n",
|
|
" Longitudinal more than 75% \\\n",
|
|
"0 *GH013810, *GH013811, *GH013812, *GH013813, *G... \n",
|
|
"1 *GH013816, *GH013817, *GH013818, *GH013819, *G... \n",
|
|
"2 *GH013824, *GH013825, *GH013826, *GH013827 \n",
|
|
"3 *GH013828, *GH013829, *GH013830, *GH013831, *G... \n",
|
|
"4 *GH013834, *GH013835, *GH013836, *GH013837, *G... \n",
|
|
"\n",
|
|
" Circumferential less than 25% Circumferential between 25-50% \\\n",
|
|
"0 NaN *GH013812, *GH013813, *GH013814 \n",
|
|
"1 NaN *GH013818 \n",
|
|
"2 *GH013823, *GH013826 NaN \n",
|
|
"3 *GH013829 *GH013830, *GH013833 \n",
|
|
"4 NaN NaN \n",
|
|
"\n",
|
|
" Circumferential between 50-75% \\\n",
|
|
"0 NaN \n",
|
|
"1 *GH013821 \n",
|
|
"2 *GH013824, *GH013827 \n",
|
|
"3 *GH013828 \n",
|
|
"4 *GH013836 \n",
|
|
"\n",
|
|
" Circumferential more than 75% \\\n",
|
|
"0 *GH013810, *GH013811, *GH013815 \n",
|
|
"1 *GH013816, *GH013817, *GH013819, *GH013820 \n",
|
|
"2 *GH013825 \n",
|
|
"3 *GH013831 \n",
|
|
"4 *GH013834, *GH013835, *GH013837, *GH013838, *G... \n",
|
|
"\n",
|
|
" Open Crack Crushed \n",
|
|
"0 *GH013810, *GH013811, *GH013812, *GH013813, *G... NaN \n",
|
|
"1 *GH013816, *GH013817, *GH013818, *GH013819, *G... NaN \n",
|
|
"2 *GH013823, *GH013824, *GH013825, *GH013827 NaN \n",
|
|
"3 *GH013828, *GH013830, *GH013831, *GH013833 NaN \n",
|
|
"4 *GH013834, *GH013835, *GH013836, *GH013837, *G... NaN "
|
|
]
|
|
},
|
|
"execution_count": 126,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Import necessary libraries\n",
|
|
"import pandas as pd\n",
|
|
"import numpy as np\n",
|
|
"\n",
|
|
"# Load the raw dataset\n",
|
|
"df = pd.read_csv(\"meyer.csv\")\n",
|
|
"print(\"Initial DataFrame shape:\", df.shape)\n",
|
|
"df.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 127,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Renamed columns: ['moisture', 'spring_stiffness', 'displacement_screw_setting', 'motor_speed', 'untouched', 'longitudinal_less_than_25%', 'longitudinal_between_25_50%', 'longitudinal_between_50_75%', 'longitudinal_more_than_75%', 'circumferential_less_than_25%', 'circumferential_between_25_50%', 'circumferential_between_50_75%', 'circumferential_more_than_75%', 'open_crack', 'crushed']\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Standardize column names\n",
|
|
"df.columns = df.columns.str.lower().str.strip().str.replace(\" \", \"_\").str.replace(\"-\", \"_\")\n",
|
|
"print(\"Renamed columns:\", df.columns.tolist())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 128,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Missing values in factor columns:\n",
|
|
"moisture 0\n",
|
|
"spring_stiffness 0\n",
|
|
"displacement_screw_setting 0\n",
|
|
"motor_speed 0\n",
|
|
"dtype: int64\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Define factor columns and convert them to numeric\n",
|
|
"factor_cols = [\"moisture\", \"spring_stiffness\", \"displacement_screw_setting\", \"motor_speed\"]\n",
|
|
"for col in factor_cols:\n",
|
|
" df[col] = pd.to_numeric(df[col], errors=\"coerce\")\n",
|
|
"print(\"Missing values in factor columns:\")\n",
|
|
"print(df[factor_cols].isnull().sum())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 129,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Crack columns identified: ['untouched', 'longitudinal_less_than_25%', 'longitudinal_between_25_50%', 'longitudinal_between_50_75%', 'longitudinal_more_than_75%', 'circumferential_less_than_25%', 'circumferential_between_25_50%', 'circumferential_between_50_75%', 'circumferential_more_than_75%', 'open_crack', 'crushed']\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Identify crack outcome columns\n",
|
|
"crack_cols = [c for c in df.columns if c not in factor_cols]\n",
|
|
"print(\"Crack columns identified:\", crack_cols)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 130,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Define functions to extract video IDs from a cell and count unique video IDs across crack columns.\n",
|
|
"def extract_video_ids(cell_value: str) -> list:\n",
|
|
" if pd.isna(cell_value):\n",
|
|
" return []\n",
|
|
" return [x.strip() for x in cell_value.split(\",\") if x.strip() != \"\"]\n",
|
|
"\n",
|
|
"\n",
|
|
"def count_unique_videos(row: pd.Series) -> int:\n",
|
|
" all_ids = []\n",
|
|
" for col in crack_cols:\n",
|
|
" all_ids.extend(extract_video_ids(row[col]))\n",
|
|
" return len(set(all_ids))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 131,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
" video_count missing_videos_flag\n",
|
|
"0 6 False\n",
|
|
"1 6 False\n",
|
|
"2 6 False\n",
|
|
"3 6 False\n",
|
|
"4 6 False\n",
|
|
"5 5 True\n",
|
|
"6 6 False\n",
|
|
"7 6 False\n",
|
|
"8 6 False\n",
|
|
"9 6 False\n",
|
|
"Number of runs with fewer than 6 videos: 16\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Compute unique video counts for each row and flag incomplete runs\n",
|
|
"df[\"video_count\"] = df.apply(count_unique_videos, axis=1)\n",
|
|
"df[\"missing_videos_flag\"] = df[\"video_count\"] < 6\n",
|
|
"print(df[[\"video_count\", \"missing_videos_flag\"]].head(10))\n",
|
|
"print(f\"Number of runs with fewer than 6 videos: {df['missing_videos_flag'].sum()}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 132,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Motor Speed values: [60 45 30]\n",
|
|
"Moisture values: [5 7 9]\n",
|
|
"Displacement Screw Setting values: [0.29 0.22 0.36]\n",
|
|
"Spring Stiffness values: [1800 2000 2200]\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Verify that factor columns contain the expected values.\n",
|
|
"\n",
|
|
"# %%\n",
|
|
"print(\"Motor Speed values:\", df[\"motor_speed\"].unique()) # Expected: [30, 45, 60]\n",
|
|
"print(\"Moisture values:\", df[\"moisture\"].unique()) # Expected: [5, 7, 9]\n",
|
|
"print(\"Displacement Screw Setting values:\", df[\"displacement_screw_setting\"].unique()) # Expected: [0.22, 0.29, 0.36]\n",
|
|
"print(\"Spring Stiffness values:\", df[\"spring_stiffness\"].unique()) # Expected: [1800, 2000, 2200]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 133,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Long-format DataFrame shape: (3769, 6)\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"application/vnd.microsoft.datawrangler.viewer.v0+json": {
|
|
"columns": [
|
|
{
|
|
"name": "index",
|
|
"rawType": "int64",
|
|
"type": "integer"
|
|
},
|
|
{
|
|
"name": "moisture",
|
|
"rawType": "int64",
|
|
"type": "integer"
|
|
},
|
|
{
|
|
"name": "spring_stiffness",
|
|
"rawType": "int64",
|
|
"type": "integer"
|
|
},
|
|
{
|
|
"name": "displacement_screw_setting",
|
|
"rawType": "float64",
|
|
"type": "float"
|
|
},
|
|
{
|
|
"name": "motor_speed",
|
|
"rawType": "int64",
|
|
"type": "integer"
|
|
},
|
|
{
|
|
"name": "video_id",
|
|
"rawType": "object",
|
|
"type": "string"
|
|
},
|
|
{
|
|
"name": "crack_type",
|
|
"rawType": "object",
|
|
"type": "string"
|
|
}
|
|
],
|
|
"conversionMethod": "pd.DataFrame",
|
|
"ref": "66f7239c-e784-429e-914f-e3d50900eb0a",
|
|
"rows": [
|
|
[
|
|
"0",
|
|
"5",
|
|
"1800",
|
|
"0.29",
|
|
"60",
|
|
"*GH013810",
|
|
"longitudinal_more_than_75%"
|
|
],
|
|
[
|
|
"1",
|
|
"5",
|
|
"1800",
|
|
"0.29",
|
|
"60",
|
|
"*GH013811",
|
|
"longitudinal_more_than_75%"
|
|
],
|
|
[
|
|
"2",
|
|
"5",
|
|
"1800",
|
|
"0.29",
|
|
"60",
|
|
"*GH013812",
|
|
"longitudinal_more_than_75%"
|
|
],
|
|
[
|
|
"3",
|
|
"5",
|
|
"1800",
|
|
"0.29",
|
|
"60",
|
|
"*GH013813",
|
|
"longitudinal_more_than_75%"
|
|
],
|
|
[
|
|
"4",
|
|
"5",
|
|
"1800",
|
|
"0.29",
|
|
"60",
|
|
"*GH013814",
|
|
"longitudinal_more_than_75%"
|
|
],
|
|
[
|
|
"5",
|
|
"5",
|
|
"1800",
|
|
"0.29",
|
|
"60",
|
|
"*GH013815",
|
|
"longitudinal_more_than_75%"
|
|
],
|
|
[
|
|
"6",
|
|
"5",
|
|
"1800",
|
|
"0.29",
|
|
"60",
|
|
"*GH013812",
|
|
"circumferential_between_25_50%"
|
|
],
|
|
[
|
|
"7",
|
|
"5",
|
|
"1800",
|
|
"0.29",
|
|
"60",
|
|
"*GH013813",
|
|
"circumferential_between_25_50%"
|
|
],
|
|
[
|
|
"8",
|
|
"5",
|
|
"1800",
|
|
"0.29",
|
|
"60",
|
|
"*GH013814",
|
|
"circumferential_between_25_50%"
|
|
],
|
|
[
|
|
"9",
|
|
"5",
|
|
"1800",
|
|
"0.29",
|
|
"60",
|
|
"*GH013810",
|
|
"circumferential_more_than_75%"
|
|
]
|
|
],
|
|
"shape": {
|
|
"columns": 6,
|
|
"rows": 10
|
|
}
|
|
},
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>moisture</th>\n",
|
|
" <th>spring_stiffness</th>\n",
|
|
" <th>displacement_screw_setting</th>\n",
|
|
" <th>motor_speed</th>\n",
|
|
" <th>video_id</th>\n",
|
|
" <th>crack_type</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>5</td>\n",
|
|
" <td>1800</td>\n",
|
|
" <td>0.29</td>\n",
|
|
" <td>60</td>\n",
|
|
" <td>*GH013810</td>\n",
|
|
" <td>longitudinal_more_than_75%</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>5</td>\n",
|
|
" <td>1800</td>\n",
|
|
" <td>0.29</td>\n",
|
|
" <td>60</td>\n",
|
|
" <td>*GH013811</td>\n",
|
|
" <td>longitudinal_more_than_75%</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>5</td>\n",
|
|
" <td>1800</td>\n",
|
|
" <td>0.29</td>\n",
|
|
" <td>60</td>\n",
|
|
" <td>*GH013812</td>\n",
|
|
" <td>longitudinal_more_than_75%</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>5</td>\n",
|
|
" <td>1800</td>\n",
|
|
" <td>0.29</td>\n",
|
|
" <td>60</td>\n",
|
|
" <td>*GH013813</td>\n",
|
|
" <td>longitudinal_more_than_75%</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>5</td>\n",
|
|
" <td>1800</td>\n",
|
|
" <td>0.29</td>\n",
|
|
" <td>60</td>\n",
|
|
" <td>*GH013814</td>\n",
|
|
" <td>longitudinal_more_than_75%</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>5</th>\n",
|
|
" <td>5</td>\n",
|
|
" <td>1800</td>\n",
|
|
" <td>0.29</td>\n",
|
|
" <td>60</td>\n",
|
|
" <td>*GH013815</td>\n",
|
|
" <td>longitudinal_more_than_75%</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>6</th>\n",
|
|
" <td>5</td>\n",
|
|
" <td>1800</td>\n",
|
|
" <td>0.29</td>\n",
|
|
" <td>60</td>\n",
|
|
" <td>*GH013812</td>\n",
|
|
" <td>circumferential_between_25_50%</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>7</th>\n",
|
|
" <td>5</td>\n",
|
|
" <td>1800</td>\n",
|
|
" <td>0.29</td>\n",
|
|
" <td>60</td>\n",
|
|
" <td>*GH013813</td>\n",
|
|
" <td>circumferential_between_25_50%</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>8</th>\n",
|
|
" <td>5</td>\n",
|
|
" <td>1800</td>\n",
|
|
" <td>0.29</td>\n",
|
|
" <td>60</td>\n",
|
|
" <td>*GH013814</td>\n",
|
|
" <td>circumferential_between_25_50%</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>9</th>\n",
|
|
" <td>5</td>\n",
|
|
" <td>1800</td>\n",
|
|
" <td>0.29</td>\n",
|
|
" <td>60</td>\n",
|
|
" <td>*GH013810</td>\n",
|
|
" <td>circumferential_more_than_75%</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" moisture spring_stiffness displacement_screw_setting motor_speed \\\n",
|
|
"0 5 1800 0.29 60 \n",
|
|
"1 5 1800 0.29 60 \n",
|
|
"2 5 1800 0.29 60 \n",
|
|
"3 5 1800 0.29 60 \n",
|
|
"4 5 1800 0.29 60 \n",
|
|
"5 5 1800 0.29 60 \n",
|
|
"6 5 1800 0.29 60 \n",
|
|
"7 5 1800 0.29 60 \n",
|
|
"8 5 1800 0.29 60 \n",
|
|
"9 5 1800 0.29 60 \n",
|
|
"\n",
|
|
" video_id crack_type \n",
|
|
"0 *GH013810 longitudinal_more_than_75% \n",
|
|
"1 *GH013811 longitudinal_more_than_75% \n",
|
|
"2 *GH013812 longitudinal_more_than_75% \n",
|
|
"3 *GH013813 longitudinal_more_than_75% \n",
|
|
"4 *GH013814 longitudinal_more_than_75% \n",
|
|
"5 *GH013815 longitudinal_more_than_75% \n",
|
|
"6 *GH013812 circumferential_between_25_50% \n",
|
|
"7 *GH013813 circumferential_between_25_50% \n",
|
|
"8 *GH013814 circumferential_between_25_50% \n",
|
|
"9 *GH013810 circumferential_more_than_75% "
|
|
]
|
|
},
|
|
"execution_count": 133,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Create long-format records: one row per video ID with its corresponding factor values and crack type.\n",
|
|
"records = []\n",
|
|
"for idx, row in df.iterrows():\n",
|
|
" # Extract factor values for the run\n",
|
|
" factors = {col: row[col] for col in factor_cols}\n",
|
|
" # Iterate through each crack outcome column and extract video IDs\n",
|
|
" for col in crack_cols:\n",
|
|
" video_ids = extract_video_ids(row[col])\n",
|
|
" for vid in video_ids:\n",
|
|
" record = factors.copy()\n",
|
|
" record[\"video_id\"] = vid\n",
|
|
" record[\"crack_type\"] = col # Original crack category name\n",
|
|
" records.append(record)\n",
|
|
"\n",
|
|
"df_long = pd.DataFrame(records)\n",
|
|
"print(\"Long-format DataFrame shape:\", df_long.shape)\n",
|
|
"df_long.head(10)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Raw aggregated columns: ['video_id', 'moisture', 'spring_stiffness', 'displacement_screw_setting', 'motor_speed', 'circumferential_between_25_50%', 'circumferential_between_50_75%', 'circumferential_less_than_25%', 'circumferential_more_than_75%', 'crushed', 'longitudinal_between_25_50%', 'longitudinal_between_50_75%', 'longitudinal_less_than_25%', 'longitudinal_more_than_75%', 'open_crack', 'untouched']\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Pivot the long-format data with 'video_id' and factors as the index, and crack_type as columns.\n",
|
|
"df_aggregated = df_long.pivot_table(index=[\"video_id\", \"moisture\", \"spring_stiffness\", \"displacement_screw_setting\", \"motor_speed\"], columns=\"crack_type\", values=\"crack_type\", aggfunc=lambda x: 1, fill_value=0).reset_index()\n",
|
|
"\n",
|
|
"# Print raw aggregated column names\n",
|
|
"print(\"Raw aggregated columns:\", df_aggregated.columns.tolist())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"After reassigning factor names:\n",
|
|
"['video', 'moisture', 'spring', 'displacement', 'motor', 'circumferential_between_25_50%', 'circumferential_between_50_75%', 'circumferential_less_than_25%', 'circumferential_more_than_75%', 'crushed', 'longitudinal_between_25_50%', 'longitudinal_between_50_75%', 'longitudinal_less_than_25%', 'longitudinal_more_than_75%', 'open_crack', 'untouched']\n",
|
|
"Raw crack outcome columns: Index(['circumferential_between_25_50%', 'circumferential_between_50_75%',\n",
|
|
" 'circumferential_less_than_25%', 'circumferential_more_than_75%',\n",
|
|
" 'crushed', 'longitudinal_between_25_50%', 'longitudinal_between_50_75%',\n",
|
|
" 'longitudinal_less_than_25%', 'longitudinal_more_than_75%',\n",
|
|
" 'open_crack', 'untouched'],\n",
|
|
" dtype='object')\n",
|
|
"Columns after renaming crack outcomes:\n",
|
|
"['video', 'moisture', 'spring', 'displacement', 'motor', 'C_2', 'C_3', 'C_1', 'C_4', 'X', 'L_2', 'L_3', 'L_1', 'L_4', 'O', 'U']\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Expected factor names:\n",
|
|
"expected_factor_cols = [\"video\", \"moisture\", \"spring\", \"displacement\", \"motor\"]\n",
|
|
"\n",
|
|
"# Get current aggregated columns as a list\n",
|
|
"current_cols = df_aggregated.columns.tolist()\n",
|
|
"\n",
|
|
"# Replace the first five columns with our desired factor names.\n",
|
|
"for i in range(len(expected_factor_cols)):\n",
|
|
" current_cols[i] = expected_factor_cols[i]\n",
|
|
"df_aggregated.columns = current_cols\n",
|
|
"print(\"After reassigning factor names:\")\n",
|
|
"print(df_aggregated.columns.tolist())\n",
|
|
"\n",
|
|
"# Print remaining crack outcome columns (raw)\n",
|
|
"print(\"Raw crack outcome columns:\", df_aggregated.columns[5:])\n",
|
|
"\n",
|
|
"# Define renaming dictionary for crack outcomes:\n",
|
|
"rename_dict = {\"untouched\": \"U\", \"longitudinal_less_than_25%\": \"L_1\", \"longitudinal_between_25_50%\": \"L_2\", \"longitudinal_between_50_75%\": \"L_3\", \"longitudinal_more_than_75%\": \"L_4\", \"circumferential_less_than_25%\": \"C_1\", \"circumferential_between_25_50%\": \"C_2\", \"circumferential_between_50_75%\": \"C_3\", \"circumferential_more_than_75%\": \"C_4\", \"open_crack\": \"O\", \"crushed\": \"X\"}\n",
|
|
"\n",
|
|
"# Rename crack outcome columns (columns 6 onward)\n",
|
|
"new_crack_cols = [rename_dict.get(col, col) for col in df_aggregated.columns[5:]]\n",
|
|
"# Combine factor columns with renamed crack outcome columns\n",
|
|
"df_aggregated.columns = expected_factor_cols + new_crack_cols\n",
|
|
"print(\"Columns after renaming crack outcomes:\")\n",
|
|
"print(df_aggregated.columns.tolist())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 136,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Final aggregated binary column order:\n",
|
|
"['video', 'moisture', 'spring', 'displacement', 'motor', 'U', 'L_1', 'L_2', 'L_3', 'L_4', 'C_1', 'C_2', 'C_3', 'C_4', 'O', 'X']\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Define desired order for crack outcome columns\n",
|
|
"desired_order = [\"U\", \"L_1\", \"L_2\", \"L_3\", \"L_4\", \"C_1\", \"C_2\", \"C_3\", \"C_4\", \"O\", \"X\"]\n",
|
|
"# Factor columns remain as defined\n",
|
|
"factor_order = expected_factor_cols\n",
|
|
"# Extract current crack outcome columns (from index 5 onward)\n",
|
|
"current_crack_cols = df_aggregated.columns.tolist()[5:]\n",
|
|
"# Reorder crack outcome columns based on desired order (only include those present)\n",
|
|
"new_crack_cols_ordered = [col for col in desired_order if col in current_crack_cols]\n",
|
|
"# Combine factor columns with the newly ordered crack outcome columns\n",
|
|
"new_column_order = factor_order + new_crack_cols_ordered\n",
|
|
"df_aggregated = df_aggregated[new_column_order]\n",
|
|
"print(\"Final aggregated binary column order:\")\n",
|
|
"print(df_aggregated.columns.tolist())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 137,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Final renamed aggregated binary dataset saved as 'meyer_aggregated_binary_renamed.csv'.\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Save the final aggregated binary dataset with the desired column names to a single CSV file.\n",
|
|
"df_aggregated.to_csv(\"meyer_aggregated_binary_renamed.csv\", index=False)\n",
|
|
"print(\"Final renamed aggregated binary dataset saved as 'meyer_aggregated_binary_renamed.csv'.\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "pecan",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.12.4"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|