333 lines
15 KiB
Plaintext
333 lines
15 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Initial DataFrame shape: (243, 15)\n",
|
|
"Renamed columns: ['moisture', 'spring_stiffness', 'displacement_screw_setting', 'motor_speed', 'untouched', 'longitudinal_less_than_25%', 'longitudinal_between_25_50%', 'longitudinal_between_50_75%', 'longitudinal_more_than_75%', 'circumferential_less_than_25%', 'circumferential_between_25_50%', 'circumferential_between_50_75%', 'circumferential_more_than_75%', 'open_crack', 'crushed']\n",
|
|
"Missing values in factor columns:\n",
|
|
"moisture 0\n",
|
|
"spring_stiffness 0\n",
|
|
"displacement_screw_setting 0\n",
|
|
"motor_speed 0\n",
|
|
"dtype: int64\n",
|
|
"Crack columns identified: ['untouched', 'longitudinal_less_than_25%', 'longitudinal_between_25_50%', 'longitudinal_between_50_75%', 'longitudinal_more_than_75%', 'circumferential_less_than_25%', 'circumferential_between_25_50%', 'circumferential_between_50_75%', 'circumferential_more_than_75%', 'open_crack', 'crushed']\n",
|
|
" video_count missing_videos_flag\n",
|
|
"0 6 False\n",
|
|
"1 6 False\n",
|
|
"2 6 False\n",
|
|
"3 6 False\n",
|
|
"4 6 False\n",
|
|
"5 5 True\n",
|
|
"6 6 False\n",
|
|
"7 6 False\n",
|
|
"8 6 False\n",
|
|
"9 6 False\n",
|
|
"Number of runs with fewer than 6 videos: 16\n",
|
|
"Motor Speed values: [60 45 30]\n",
|
|
"Moisture values: [5 7 9]\n",
|
|
"Displacement Screw Setting values: [0.29 0.22 0.36]\n",
|
|
"Spring Stiffness values: [1800 2000 2200]\n",
|
|
"Long-format DataFrame shape: (3769, 6)\n",
|
|
"Raw aggregated columns: ['video_id', 'moisture', 'spring_stiffness', 'displacement_screw_setting', 'motor_speed', 'circumferential_between_25_50%', 'circumferential_between_50_75%', 'circumferential_less_than_25%', 'circumferential_more_than_75%', 'crushed', 'longitudinal_between_25_50%', 'longitudinal_between_50_75%', 'longitudinal_less_than_25%', 'longitudinal_more_than_75%', 'open_crack', 'untouched']\n",
|
|
"After reassigning factor names:\n",
|
|
"['video_id', 'moisture', 'spring_stiffness', 'displacement_screw_setting', 'motor_speed', 'circumferential_between_25_50%', 'circumferential_between_50_75%', 'circumferential_less_than_25%', 'circumferential_more_than_75%', 'crushed', 'longitudinal_between_25_50%', 'longitudinal_between_50_75%', 'longitudinal_less_than_25%', 'longitudinal_more_than_75%', 'open_crack', 'untouched']\n",
|
|
"Crack outcome columns (raw): Index(['circumferential_between_25_50%', 'circumferential_between_50_75%',\n",
|
|
" 'circumferential_less_than_25%', 'circumferential_more_than_75%',\n",
|
|
" 'crushed', 'longitudinal_between_25_50%', 'longitudinal_between_50_75%',\n",
|
|
" 'longitudinal_less_than_25%', 'longitudinal_more_than_75%',\n",
|
|
" 'open_crack', 'untouched'],\n",
|
|
" dtype='object')\n",
|
|
"Final aggregated binary column names:\n",
|
|
"['video_id', 'moisture', 'spring_stiffness', 'displacement_screw_setting', 'motor_speed', 'C_2', 'C_3', 'C_1', 'C_4', 'X', 'L_2', 'L_3', 'L_1', 'L_4', 'O', 'U']\n",
|
|
"Final renamed aggregated binary dataset saved as 'meyer_aggregated_binary_renamed.csv'.\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# %% [markdown]\n",
|
|
"# # Data Cleaning, Reshaping, and Aggregation for Meyer Pecan Cracking Dataset\n",
|
|
"#\n",
|
|
"# This notebook loads the raw dataset (\"meyer.csv\"), standardizes column names and data types,\n",
|
|
"# extracts video IDs from crack outcome columns, reshapes the data into long format, and then aggregates\n",
|
|
"# the data into a binary indicator format (one row per video) with concise crack outcome column names.\n",
|
|
"#\n",
|
|
"# The final crack outcome columns will be renamed as follows:\n",
|
|
"# U : untouched\n",
|
|
"# L_1 : longitudinal_less_than_25%\n",
|
|
"# L_2 : longitudinal_between_25_50%\n",
|
|
"# L_3 : longitudinal_between_50_75%\n",
|
|
"# L_4 : longitudinal_more_than_75%\n",
|
|
"# C_1 : circumferential_less_than_25%\n",
|
|
"# C_2 : circumferential_between_25_50%\n",
|
|
"# C_3 : circumferential_between_50_75%\n",
|
|
"# C_4 : circumferential_more_than_75%\n",
|
|
"# O : open_crack\n",
|
|
"# X : crushed\n",
|
|
"#\n",
|
|
"# Only one final CSV file (\"meyer_aggregated_binary_renamed.csv\") is saved.\n",
|
|
"\n",
|
|
"# %% [markdown]\n",
|
|
"# ## Cell 1: Import Libraries and Load Data\n",
|
|
"\n",
|
|
"# %%\n",
|
|
"import pandas as pd\n",
|
|
"import numpy as np\n",
|
|
"\n",
|
|
"# Load the raw dataset\n",
|
|
"RAW_CSV_PATH = \"meyer.csv\"\n",
|
|
"df = pd.read_csv(RAW_CSV_PATH)\n",
|
|
"print(f\"Initial DataFrame shape: {df.shape}\")\n",
|
|
"df.head()\n",
|
|
"\n",
|
|
"# %% [markdown]\n",
|
|
"# ## Cell 2: Standardize Column Names\n",
|
|
"# Convert all column names to lower case, strip extra spaces, and replace spaces/hyphens with underscores.\n",
|
|
"\n",
|
|
"# %%\n",
|
|
"# Lower-case and trim each header, then turn spaces and hyphens into underscores.\n",
|
|
"normalized_names = df.columns.str.lower().str.strip()\n",
|
|
"for separator in (\" \", \"-\"):\n",
|
|
"    normalized_names = normalized_names.str.replace(separator, \"_\")\n",
|
|
"df.columns = normalized_names\n",
|
|
"print(\"Renamed columns:\", df.columns.tolist())\n",
|
|
"\n",
|
|
"# %% [markdown]\n",
|
|
"# ## Cell 3: Ensure Factor Columns are Numeric\n",
|
|
"# Convert factor columns to numeric and verify missing values.\n",
|
|
"\n",
|
|
"# %%\n",
|
|
"factor_cols = [\"moisture\", \"spring_stiffness\", \"displacement_screw_setting\", \"motor_speed\"]\n",
|
|
"# Coerce all factor columns in one pass; entries that cannot be parsed become NaN.\n",
|
|
"df[factor_cols] = df[factor_cols].apply(pd.to_numeric, errors=\"coerce\")\n",
|
|
"print(\"Missing values in factor columns:\")\n",
|
|
"print(df[factor_cols].isnull().sum())\n",
|
|
"\n",
|
|
"# %% [markdown]\n",
|
|
"# ## Cell 4: Identify Crack Outcome Columns\n",
|
|
"# All columns not in factor_cols are considered crack outcome columns.\n",
|
|
"\n",
|
|
"# %%\n",
|
|
"factor_name_set = set(factor_cols)\n",
|
|
"crack_cols = [name for name in df.columns if name not in factor_name_set]\n",
|
|
"print(\"Crack columns identified:\", crack_cols)\n",
|
|
"\n",
|
|
"# %% [markdown]\n",
|
|
"# ## Cell 5: Define Helper Functions for Video Extraction\n",
|
|
"# Define functions to extract video IDs from a cell and count unique video IDs across the crack outcome columns.\n",
|
|
"\n",
|
|
"\n",
|
|
"# %%\n",
|
|
"def extract_video_ids(cell_value: str) -> list:\n",
|
|
"    \"\"\"\n",
|
|
"    Extract the video IDs stored in one crack-outcome cell.\n",
|
|
"\n",
|
|
"    The cell is expected to hold a comma-separated list of IDs. Each piece is\n",
|
|
"    stripped of surrounding whitespace and empty pieces are discarded.\n",
|
|
"\n",
|
|
"    Parameters\n",
|
|
"    ----------\n",
|
|
"    cell_value : str or scalar\n",
|
|
"        Raw cell content. Missing values (None / NaN) yield an empty list.\n",
|
|
"        Non-string scalars (e.g. a lone ID that pandas parsed as a number) are\n",
|
|
"        converted to text instead of raising AttributeError on ``.split``.\n",
|
|
"\n",
|
|
"    Returns\n",
|
|
"    -------\n",
|
|
"    list of str\n",
|
|
"        The cleaned IDs, in the order they appear in the cell.\n",
|
|
"    \"\"\"\n",
|
|
"    if pd.isna(cell_value):\n",
|
|
"        return []\n",
|
|
"    if isinstance(cell_value, float) and cell_value.is_integer():\n",
|
|
"        # A column holding only lone numeric IDs is read as float (12 -> 12.0);\n",
|
|
"        # drop the spurious \".0\" so the ID matches its plain text form.\n",
|
|
"        cell_value = int(cell_value)\n",
|
|
"    pieces = (piece.strip() for piece in str(cell_value).split(\",\"))\n",
|
|
"    return [piece for piece in pieces if piece]\n",
|
|
"\n",
|
|
"\n",
|
|
"def count_unique_videos(row: pd.Series) -> int:\n",
|
|
"    \"\"\"\n",
|
|
"    Return how many distinct video IDs appear in a row's crack outcome columns.\n",
|
|
"\n",
|
|
"    Reads the module-level ``crack_cols`` list and parses each of those cells\n",
|
|
"    with ``extract_video_ids``; an ID listed under several outcomes counts once.\n",
|
|
"    \"\"\"\n",
|
|
"    distinct_ids = {vid for col in crack_cols for vid in extract_video_ids(row[col])}\n",
|
|
"    return len(distinct_ids)\n",
|
|
"\n",
|
|
"\n",
|
|
"# %% [markdown]\n",
|
|
"# ## Cell 6: Compute Video Count and Flag Incomplete Runs\n",
|
|
"# Add 'video_count' and 'missing_videos_flag' columns.\n",
|
|
"\n",
|
|
"# %%\n",
|
|
"# Count the distinct videos per run, then flag runs that have fewer than 6.\n",
|
|
"df[\"video_count\"] = df.apply(count_unique_videos, axis=1)\n",
|
|
"df[\"missing_videos_flag\"] = df[\"video_count\"].lt(6)\n",
|
|
"print(df[[\"video_count\", \"missing_videos_flag\"]].head(10))\n",
|
|
"n_incomplete_runs = df[\"missing_videos_flag\"].sum()\n",
|
|
"print(f\"Number of runs with fewer than 6 videos: {n_incomplete_runs}\")\n",
|
|
"\n",
|
|
"# %% [markdown]\n",
|
|
"# ## Cell 7: Data Integrity Checks for Factor Columns\n",
|
|
"# Print the unique values of each factor so they can be checked against the expected levels.\n",
|
|
"\n",
|
|
"# %%\n",
|
|
"# Expected levels of each factor, keyed by column name: (label used in the printout, allowed values).\n",
|
|
"expected_levels = {\n",
|
|
"    \"motor_speed\": (\"Motor Speed\", [30, 45, 60]),\n",
|
|
"    \"moisture\": (\"Moisture\", [5, 7, 9]),\n",
|
|
"    \"displacement_screw_setting\": (\"Displacement Screw Setting\", [0.22, 0.29, 0.36]),\n",
|
|
"    \"spring_stiffness\": (\"Spring Stiffness\", [1800, 2000, 2200]),\n",
|
|
"}\n",
|
|
"for col, (label, levels) in expected_levels.items():\n",
|
|
"    observed = df[col].unique()\n",
|
|
"    print(f\"{label} values:\", observed)\n",
|
|
"    # np.isclose tolerates float parsing noise (e.g. 0.29); NaN never matches, so it is reported too.\n",
|
|
"    unexpected = [value for value in observed if not np.isclose(value, levels).any()]\n",
|
|
"    assert not unexpected, f\"Unexpected {label} values: {unexpected}\"\n",
|
|
"\n",
|
|
"# %% [markdown]\n",
|
|
"# ## Cell 8: Reshape Data to Long Format\n",
|
|
"# Convert the wide-format dataset to long format so that each row corresponds to one video (pecan) with its crack type.\n",
|
|
"\n",
|
|
"# %%\n",
|
|
"# One record per (run, crack category, video ID): the run's factor values,\n",
|
|
"# the video ID, and the original crack category name.\n",
|
|
"records = [\n",
|
|
"    {**{name: row[name] for name in factor_cols}, \"video_id\": vid, \"crack_type\": crack_name}\n",
|
|
"    for _, row in df.iterrows()\n",
|
|
"    for crack_name in crack_cols\n",
|
|
"    for vid in extract_video_ids(row[crack_name])\n",
|
|
"]\n",
|
|
"\n",
|
|
"df_long = pd.DataFrame(records)\n",
|
|
"print(\"Long-format DataFrame shape:\", df_long.shape)\n",
|
|
"df_long.head(10)\n",
|
|
"\n",
|
|
"# %% [markdown]\n",
|
|
"# ## Cell 9: Aggregate Long-Format Data into a Binary Format\n",
|
|
"# Aggregate the long-format data so that each unique video_id (with its factors) has binary indicators\n",
|
|
"# for each crack outcome. We use pivot_table with the 'values' parameter to preserve full column names.\n",
|
|
"\n",
|
|
"# %%\n",
|
|
"# Columns that identify one output row: the video ID plus the factor settings.\n",
|
|
"video_key_cols = [\"video_id\", \"moisture\", \"spring_stiffness\", \"displacement_screw_setting\", \"motor_speed\"]\n",
|
|
"\n",
|
|
"# The constant aggfunc marks every (video, crack type) pair that occurs with 1;\n",
|
|
"# fill_value=0 covers the pairs that never occur.\n",
|
|
"df_aggregated = df_long.pivot_table(\n",
|
|
"    index=video_key_cols,\n",
|
|
"    columns=\"crack_type\",\n",
|
|
"    values=\"crack_type\",\n",
|
|
"    aggfunc=lambda _: 1,\n",
|
|
"    fill_value=0,\n",
|
|
").reset_index()\n",
|
|
"\n",
|
|
"# Check the raw column names after pivoting\n",
|
|
"print(\"Raw aggregated columns:\", df_aggregated.columns.tolist())\n",
|
|
"\n",
|
|
"# %% [markdown]\n",
|
|
"# ## Cell 10: Reassign Factor Column Names and Rename Crack Outcome Columns\n",
|
|
"# We explicitly set the first 5 columns to our expected factor names, then rename the remaining crack columns\n",
|
|
"# using the following concise mapping:\n",
|
|
"#\n",
|
|
"# U : untouched\n",
|
|
"# L_1 : longitudinal_less_than_25%\n",
|
|
"# L_2 : longitudinal_between_25_50%\n",
|
|
"# L_3 : longitudinal_between_50_75%\n",
|
|
"# L_4 : longitudinal_more_than_75%\n",
|
|
"# C_1 : circumferential_less_than_25%\n",
|
|
"# C_2 : circumferential_between_25_50%\n",
|
|
"# C_3 : circumferential_between_50_75%\n",
|
|
"# C_4 : circumferential_more_than_75%\n",
|
|
"# O : open_crack\n",
|
|
"# X : crushed\n",
|
|
"\n",
|
|
"# %%\n",
|
|
"# Identifier columns produced by reset_index(): the video ID plus the four factors.\n",
|
|
"expected_factor_cols = [\"video_id\", \"moisture\", \"spring_stiffness\", \"displacement_screw_setting\", \"motor_speed\"]\n",
|
|
"\n",
|
|
"# Check the leading columns by name instead of overwriting their labels by position:\n",
|
|
"# a positional overwrite would silently mislabel data if the pivot returned another order.\n",
|
|
"raw_cols = df_aggregated.columns.tolist()\n",
|
|
"leading_cols = raw_cols[:len(expected_factor_cols)]\n",
|
|
"assert leading_cols == expected_factor_cols, f\"Unexpected leading columns: {leading_cols}\"\n",
|
|
"\n",
|
|
"# Drop the 'crack_type' label left on the column axis by the pivot.\n",
|
|
"df_aggregated = df_aggregated.rename_axis(columns=None)\n",
|
|
"\n",
|
|
"print(\"After reassigning factor names:\")\n",
|
|
"print(df_aggregated.columns.tolist())\n",
|
|
"\n",
|
|
"# The remaining columns are the crack outcome columns (should be full names from df_long)\n",
|
|
"print(\"Crack outcome columns (raw):\", df_aggregated.columns[5:])\n",
|
|
"\n",
|
|
"# Define the renaming dictionary for crack outcomes:\n",
|
|
"rename_dict = {\n",
|
|
"    \"untouched\": \"U\",\n",
|
|
"    \"longitudinal_less_than_25%\": \"L_1\",\n",
|
|
"    \"longitudinal_between_25_50%\": \"L_2\",\n",
|
|
"    \"longitudinal_between_50_75%\": \"L_3\",\n",
|
|
"    \"longitudinal_more_than_75%\": \"L_4\",\n",
|
|
"    \"circumferential_less_than_25%\": \"C_1\",\n",
|
|
"    \"circumferential_between_25_50%\": \"C_2\",\n",
|
|
"    \"circumferential_between_50_75%\": \"C_3\",\n",
|
|
"    \"circumferential_more_than_75%\": \"C_4\",\n",
|
|
"    \"open_crack\": \"O\",\n",
|
|
"    \"crushed\": \"X\",\n",
|
|
"}\n",
|
|
"\n",
|
|
"# Every crack outcome column must have a short code; fail loudly instead of silently\n",
|
|
"# keeping a long name. Already-short names are accepted so the cell can be re-run.\n",
|
|
"short_names = set(rename_dict.values())\n",
|
|
"unmapped = [\n",
|
|
"    col for col in df_aggregated.columns\n",
|
|
"    if col not in expected_factor_cols and col not in rename_dict and col not in short_names\n",
|
|
"]\n",
|
|
"assert not unmapped, f\"No short name defined for crack columns: {unmapped}\"\n",
|
|
"\n",
|
|
"# Rename by column name, not by position.\n",
|
|
"df_aggregated = df_aggregated.rename(columns=rename_dict)\n",
|
|
"\n",
|
|
"print(\"Final aggregated binary column names:\")\n",
|
|
"print(df_aggregated.columns.tolist())\n",
|
|
"\n",
|
|
"# %% [markdown]\n",
|
|
"# ## Cell 11: Save the Final Aggregated Binary Dataset\n",
|
|
"# Save the final aggregated binary dataset as a single CSV file.\n",
|
|
"\n",
|
|
"# %%\n",
|
|
"# NOTE(review): the columns are still in pivot order at this point. A later cell reorders them\n",
|
|
"# and writes this same path again, so this file is only final once that cell has run.\n",
|
|
"df_aggregated.to_csv(\"meyer_aggregated_binary_renamed.csv\", index=False)\n",
|
|
"print(\"Final renamed aggregated binary dataset saved as 'meyer_aggregated_binary_renamed.csv'.\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Reordered aggregated binary columns:\n",
|
|
"['video_id', 'moisture', 'spring_stiffness', 'displacement_screw_setting', 'motor_speed', 'U', 'L_1', 'L_2', 'L_3', 'L_4', 'C_1', 'C_2', 'C_3', 'C_4', 'O', 'X']\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Desired order for crack outcome columns:\n",
|
|
"desired_order = [\"U\", \"L_1\", \"L_2\", \"L_3\", \"L_4\", \"C_1\", \"C_2\", \"C_3\", \"C_4\", \"O\", \"X\"]\n",
|
|
"\n",
|
|
"# Identifier columns (video ID plus the four factors) stay at the front.\n",
|
|
"factor_order = [\"video_id\", \"moisture\", \"spring_stiffness\", \"displacement_screw_setting\", \"motor_speed\"]\n",
|
|
"\n",
|
|
"# Pick out the crack outcome columns by name rather than by position.\n",
|
|
"current_crack_cols = [col for col in df_aggregated.columns if col not in factor_order]\n",
|
|
"\n",
|
|
"# A plain membership filter would silently drop any column missing from desired_order;\n",
|
|
"# keep such columns at the end and report both kinds of mismatch.\n",
|
|
"new_crack_cols = [col for col in desired_order if col in current_crack_cols]\n",
|
|
"extra_crack_cols = [col for col in current_crack_cols if col not in desired_order]\n",
|
|
"missing_crack_cols = [col for col in desired_order if col not in current_crack_cols]\n",
|
|
"if extra_crack_cols:\n",
|
|
"    print(\"WARNING: crack columns without a defined position (kept at the end):\", extra_crack_cols)\n",
|
|
"if missing_crack_cols:\n",
|
|
"    print(\"WARNING: expected crack columns absent from the data:\", missing_crack_cols)\n",
|
|
"\n",
|
|
"# Combine the identifier columns with the ordered crack outcome columns.\n",
|
|
"new_column_order = factor_order + new_crack_cols + extra_crack_cols\n",
|
|
"\n",
|
|
"# Reorder the DataFrame's columns.\n",
|
|
"df_aggregated = df_aggregated[new_column_order]\n",
|
|
"print(\"Reordered aggregated binary columns:\")\n",
|
|
"print(df_aggregated.columns.tolist())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Final renamed aggregated binary dataset saved as 'meyer_aggregated_binary_renamed.csv'.\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"FINAL_CSV_PATH = \"meyer_aggregated_binary_renamed.csv\"\n",
|
|
"df_aggregated.to_csv(FINAL_CSV_PATH, index=False)\n",
|
|
"print(f\"Final renamed aggregated binary dataset saved as '{FINAL_CSV_PATH}'.\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "pecan",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.12.4"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|