usda-cracking-analysis/analysis1.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Initial DataFrame shape: (243, 15)\n"
     ]
    },
    {
     "data": {
      "application/vnd.microsoft.datawrangler.viewer.v0+json": {
       "columns": [
        {
         "name": "index",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "moisture",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "spring_stiffness ",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "displacement_screw_setting",
         "rawType": "float64",
         "type": "float"
        },
        {
         "name": "motor_speed",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "untouched",
         "rawType": "object",
         "type": "unknown"
        },
        {
         "name": "longitudinal less than 25%",
         "rawType": "object",
         "type": "unknown"
        },
        {
         "name": "Longitudinal between 25-50%",
         "rawType": "object",
         "type": "unknown"
        },
        {
         "name": "Longitudinal between 50-75%",
         "rawType": "object",
         "type": "unknown"
        },
        {
         "name": "Longitudinal more than 75%",
         "rawType": "object",
         "type": "string"
        },
        {
         "name": "Circumferential less than 25%",
         "rawType": "object",
         "type": "unknown"
        },
        {
         "name": "Circumferential between 25-50%",
         "rawType": "object",
         "type": "unknown"
        },
        {
         "name": "Circumferential between 50-75%",
         "rawType": "object",
         "type": "unknown"
        },
        {
         "name": "Circumferential more than 75%",
         "rawType": "object",
         "type": "string"
        },
        {
         "name": "Open Crack",
         "rawType": "object",
         "type": "string"
        },
        {
         "name": "Crushed",
         "rawType": "object",
         "type": "unknown"
        }
       ],
       "conversionMethod": "pd.DataFrame",
       "ref": "12a85fd7-7b36-4ff2-bae9-05e2193cf380",
       "rows": [
        [
         "0",
         "5",
         "1800",
         "0.29",
         "60",
         null,
         null,
         null,
         null,
         "*GH013810, *GH013811, *GH013812, *GH013813, *GH013814, *GH013815",
         null,
         "*GH013812, *GH013813, *GH013814",
         null,
         "*GH013810, *GH013811, *GH013815",
         "*GH013810, *GH013811, *GH013812, *GH013813, *GH013814, *GH013815",
         null
        ],
        [
         "1",
         "5",
         "1800",
         "0.22",
         "45",
         null,
         null,
         null,
         null,
         "*GH013816, *GH013817, *GH013818, *GH013819, *GH013820, *GH013821",
         null,
         "*GH013818",
         "*GH013821",
         "*GH013816, *GH013817, *GH013819, *GH013820",
         "*GH013816, *GH013817, *GH013818, *GH013819, *GH013820, *GH013821",
         null
        ],
        [
         "2",
         "5",
         "1800",
         "0.36",
         "30",
         "*GH013822",
         null,
         null,
         "*GH013823",
         "*GH013824, *GH013825, *GH013826, *GH013827",
         "*GH013823, *GH013826",
         null,
         "*GH013824, *GH013827",
         "*GH013825",
         "*GH013823, *GH013824, *GH013825, *GH013827",
         null
        ],
        [
         "3",
         "5",
         "1800",
         "0.36",
         "60",
         "*GH013832",
         null,
         null,
         null,
         "*GH013828, *GH013829, *GH013830, *GH013831, *GH013833",
         "*GH013829",
         "*GH013830, *GH013833",
         "*GH013828",
         "*GH013831",
         "*GH013828, *GH013830, *GH013831, *GH013833",
         null
        ],
        [
         "4",
         "5",
         "1800",
         "0.22",
         "30",
         null,
         null,
         null,
         null,
         "*GH013834, *GH013835, *GH013836, *GH013837, *GH013838, *GH013839",
         null,
         null,
         "*GH013836",
         "*GH013834, *GH013835, *GH013837, *GH013838, *GH013839",
         "*GH013834, *GH013835, *GH013836, *GH013837, *GH013838, *GH013839",
         null
        ]
       ],
       "shape": {
        "columns": 15,
        "rows": 5
       }
      },
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>moisture</th>\n",
       "      <th>spring_stiffness</th>\n",
       "      <th>displacement_screw_setting</th>\n",
       "      <th>motor_speed</th>\n",
       "      <th>untouched</th>\n",
       "      <th>longitudinal less than 25%</th>\n",
       "      <th>Longitudinal between 25-50%</th>\n",
       "      <th>Longitudinal between 50-75%</th>\n",
       "      <th>Longitudinal more than 75%</th>\n",
       "      <th>Circumferential less than 25%</th>\n",
       "      <th>Circumferential between 25-50%</th>\n",
       "      <th>Circumferential between 50-75%</th>\n",
       "      <th>Circumferential more than 75%</th>\n",
       "      <th>Open Crack</th>\n",
       "      <th>Crushed</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5</td>\n",
       "      <td>1800</td>\n",
       "      <td>0.29</td>\n",
       "      <td>60</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>*GH013810, *GH013811, *GH013812, *GH013813, *G...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>*GH013812, *GH013813, *GH013814</td>\n",
       "      <td>NaN</td>\n",
       "      <td>*GH013810, *GH013811, *GH013815</td>\n",
       "      <td>*GH013810, *GH013811, *GH013812, *GH013813, *G...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>5</td>\n",
       "      <td>1800</td>\n",
       "      <td>0.22</td>\n",
       "      <td>45</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>*GH013816, *GH013817, *GH013818, *GH013819, *G...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>*GH013818</td>\n",
       "      <td>*GH013821</td>\n",
       "      <td>*GH013816, *GH013817, *GH013819, *GH013820</td>\n",
       "      <td>*GH013816, *GH013817, *GH013818, *GH013819, *G...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>5</td>\n",
       "      <td>1800</td>\n",
       "      <td>0.36</td>\n",
       "      <td>30</td>\n",
       "      <td>*GH013822</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>*GH013823</td>\n",
       "      <td>*GH013824, *GH013825, *GH013826, *GH013827</td>\n",
       "      <td>*GH013823, *GH013826</td>\n",
       "      <td>NaN</td>\n",
       "      <td>*GH013824, *GH013827</td>\n",
       "      <td>*GH013825</td>\n",
       "      <td>*GH013823, *GH013824, *GH013825, *GH013827</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>5</td>\n",
       "      <td>1800</td>\n",
       "      <td>0.36</td>\n",
       "      <td>60</td>\n",
       "      <td>*GH013832</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>*GH013828, *GH013829, *GH013830, *GH013831, *G...</td>\n",
       "      <td>*GH013829</td>\n",
       "      <td>*GH013830, *GH013833</td>\n",
       "      <td>*GH013828</td>\n",
       "      <td>*GH013831</td>\n",
       "      <td>*GH013828, *GH013830, *GH013831, *GH013833</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>1800</td>\n",
       "      <td>0.22</td>\n",
       "      <td>30</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>*GH013834, *GH013835, *GH013836, *GH013837, *G...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>*GH013836</td>\n",
       "      <td>*GH013834, *GH013835, *GH013837, *GH013838, *G...</td>\n",
       "      <td>*GH013834, *GH013835, *GH013836, *GH013837, *G...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   moisture  spring_stiffness   displacement_screw_setting  motor_speed  \\\n",
       "0         5               1800                        0.29           60   \n",
       "1         5               1800                        0.22           45   \n",
       "2         5               1800                        0.36           30   \n",
       "3         5               1800                        0.36           60   \n",
       "4         5               1800                        0.22           30   \n",
       "\n",
       "   untouched longitudinal less than 25% Longitudinal between 25-50%  \\\n",
       "0        NaN                        NaN                         NaN   \n",
       "1        NaN                        NaN                         NaN   \n",
       "2  *GH013822                        NaN                         NaN   \n",
       "3  *GH013832                        NaN                         NaN   \n",
       "4        NaN                        NaN                         NaN   \n",
       "\n",
       "  Longitudinal between 50-75%  \\\n",
       "0                         NaN   \n",
       "1                         NaN   \n",
       "2                   *GH013823   \n",
       "3                         NaN   \n",
       "4                         NaN   \n",
       "\n",
       "                          Longitudinal more than 75%  \\\n",
       "0  *GH013810, *GH013811, *GH013812, *GH013813, *G...   \n",
       "1  *GH013816, *GH013817, *GH013818, *GH013819, *G...   \n",
       "2         *GH013824, *GH013825, *GH013826, *GH013827   \n",
       "3  *GH013828, *GH013829, *GH013830, *GH013831, *G...   \n",
       "4  *GH013834, *GH013835, *GH013836, *GH013837, *G...   \n",
       "\n",
       "  Circumferential less than 25%   Circumferential between 25-50%  \\\n",
       "0                           NaN  *GH013812, *GH013813, *GH013814   \n",
       "1                           NaN                        *GH013818   \n",
       "2          *GH013823, *GH013826                              NaN   \n",
       "3                     *GH013829             *GH013830, *GH013833   \n",
       "4                           NaN                              NaN   \n",
       "\n",
       "  Circumferential between 50-75%  \\\n",
       "0                            NaN   \n",
       "1                      *GH013821   \n",
       "2           *GH013824, *GH013827   \n",
       "3                      *GH013828   \n",
       "4                      *GH013836   \n",
       "\n",
       "                       Circumferential more than 75%  \\\n",
       "0                    *GH013810, *GH013811, *GH013815   \n",
       "1         *GH013816, *GH013817, *GH013819, *GH013820   \n",
       "2                                          *GH013825   \n",
       "3                                          *GH013831   \n",
       "4  *GH013834, *GH013835, *GH013837, *GH013838, *G...   \n",
       "\n",
       "                                          Open Crack Crushed  \n",
       "0  *GH013810, *GH013811, *GH013812, *GH013813, *G...     NaN  \n",
       "1  *GH013816, *GH013817, *GH013818, *GH013819, *G...     NaN  \n",
       "2         *GH013823, *GH013824, *GH013825, *GH013827     NaN  \n",
       "3         *GH013828, *GH013830, *GH013831, *GH013833     NaN  \n",
       "4  *GH013834, *GH013835, *GH013836, *GH013837, *G...     NaN  "
      ]
     },
     "execution_count": 126,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Import necessary libraries\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "# Load the raw dataset\n",
    "df = pd.read_csv(\"meyer.csv\")\n",
    "print(\"Initial DataFrame shape:\", df.shape)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Renamed columns: ['moisture', 'spring_stiffness', 'displacement_screw_setting', 'motor_speed', 'untouched', 'longitudinal_less_than_25%', 'longitudinal_between_25_50%', 'longitudinal_between_50_75%', 'longitudinal_more_than_75%', 'circumferential_less_than_25%', 'circumferential_between_25_50%', 'circumferential_between_50_75%', 'circumferential_more_than_75%', 'open_crack', 'crushed']\n"
     ]
    }
   ],
   "source": [
    "# Standardize column names\n",
    "df.columns = df.columns.str.lower().str.strip().str.replace(\" \", \"_\").str.replace(\"-\", \"_\")\n",
    "print(\"Renamed columns:\", df.columns.tolist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 128,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Missing values in factor columns:\n",
      "moisture                      0\n",
      "spring_stiffness              0\n",
      "displacement_screw_setting    0\n",
      "motor_speed                   0\n",
      "dtype: int64\n"
     ]
    }
   ],
   "source": [
    "# Define factor columns and convert them to numeric\n",
    "factor_cols = [\"moisture\", \"spring_stiffness\", \"displacement_screw_setting\", \"motor_speed\"]\n",
    "for col in factor_cols:\n",
    "    df[col] = pd.to_numeric(df[col], errors=\"coerce\")\n",
    "print(\"Missing values in factor columns:\")\n",
    "print(df[factor_cols].isnull().sum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Crack columns identified: ['untouched', 'longitudinal_less_than_25%', 'longitudinal_between_25_50%', 'longitudinal_between_50_75%', 'longitudinal_more_than_75%', 'circumferential_less_than_25%', 'circumferential_between_25_50%', 'circumferential_between_50_75%', 'circumferential_more_than_75%', 'open_crack', 'crushed']\n"
     ]
    }
   ],
   "source": [
    "# Identify crack outcome columns\n",
    "crack_cols = [c for c in df.columns if c not in factor_cols]\n",
    "print(\"Crack columns identified:\", crack_cols)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define functions to extract video IDs from a cell and count unique video IDs across crack columns.\n",
    "def extract_video_ids(cell_value: str) -> list:\n",
    "    if pd.isna(cell_value):\n",
    "        return []\n",
    "    return [x.strip() for x in cell_value.split(\",\") if x.strip() != \"\"]\n",
    "\n",
    "\n",
    "def count_unique_videos(row: pd.Series) -> int:\n",
    "    all_ids = []\n",
    "    for col in crack_cols:\n",
    "        all_ids.extend(extract_video_ids(row[col]))\n",
    "    return len(set(all_ids))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   video_count  missing_videos_flag\n",
      "0            6                False\n",
      "1            6                False\n",
      "2            6                False\n",
      "3            6                False\n",
      "4            6                False\n",
      "5            5                 True\n",
      "6            6                False\n",
      "7            6                False\n",
      "8            6                False\n",
      "9            6                False\n",
      "Number of runs with fewer than 6 videos: 16\n"
     ]
    }
   ],
   "source": [
    "# Compute unique video counts for each row and flag incomplete runs\n",
    "df[\"video_count\"] = df.apply(count_unique_videos, axis=1)\n",
    "df[\"missing_videos_flag\"] = df[\"video_count\"] < 6\n",
    "print(df[[\"video_count\", \"missing_videos_flag\"]].head(10))\n",
    "print(f\"Number of runs with fewer than 6 videos: {df['missing_videos_flag'].sum()}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Motor Speed values: [60 45 30]\n",
      "Moisture values: [5 7 9]\n",
      "Displacement Screw Setting values: [0.29 0.22 0.36]\n",
      "Spring Stiffness values: [1800 2000 2200]\n"
     ]
    }
   ],
   "source": [
    "# Verify that factor columns contain the expected values.\n",
    "\n",
    "# %%\n",
    "print(\"Motor Speed values:\", df[\"motor_speed\"].unique())  # Expected: [30, 45, 60]\n",
    "print(\"Moisture values:\", df[\"moisture\"].unique())  # Expected: [5, 7, 9]\n",
    "print(\"Displacement Screw Setting values:\", df[\"displacement_screw_setting\"].unique())  # Expected: [0.22, 0.29, 0.36]\n",
    "print(\"Spring Stiffness values:\", df[\"spring_stiffness\"].unique())  # Expected: [1800, 2000, 2200]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Long-format DataFrame shape: (3769, 6)\n"
     ]
    },
    {
     "data": {
      "application/vnd.microsoft.datawrangler.viewer.v0+json": {
       "columns": [
        {
         "name": "index",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "moisture",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "spring_stiffness",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "displacement_screw_setting",
         "rawType": "float64",
         "type": "float"
        },
        {
         "name": "motor_speed",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "video_id",
         "rawType": "object",
         "type": "string"
        },
        {
         "name": "crack_type",
         "rawType": "object",
         "type": "string"
        }
       ],
       "conversionMethod": "pd.DataFrame",
       "ref": "66f7239c-e784-429e-914f-e3d50900eb0a",
       "rows": [
        [
         "0",
         "5",
         "1800",
         "0.29",
         "60",
         "*GH013810",
         "longitudinal_more_than_75%"
        ],
        [
         "1",
         "5",
         "1800",
         "0.29",
         "60",
         "*GH013811",
         "longitudinal_more_than_75%"
        ],
        [
         "2",
         "5",
         "1800",
         "0.29",
         "60",
         "*GH013812",
         "longitudinal_more_than_75%"
        ],
        [
         "3",
         "5",
         "1800",
         "0.29",
         "60",
         "*GH013813",
         "longitudinal_more_than_75%"
        ],
        [
         "4",
         "5",
         "1800",
         "0.29",
         "60",
         "*GH013814",
         "longitudinal_more_than_75%"
        ],
        [
         "5",
         "5",
         "1800",
         "0.29",
         "60",
         "*GH013815",
         "longitudinal_more_than_75%"
        ],
        [
         "6",
         "5",
         "1800",
         "0.29",
         "60",
         "*GH013812",
         "circumferential_between_25_50%"
        ],
        [
         "7",
         "5",
         "1800",
         "0.29",
         "60",
         "*GH013813",
         "circumferential_between_25_50%"
        ],
        [
         "8",
         "5",
         "1800",
         "0.29",
         "60",
         "*GH013814",
         "circumferential_between_25_50%"
        ],
        [
         "9",
         "5",
         "1800",
         "0.29",
         "60",
         "*GH013810",
         "circumferential_more_than_75%"
        ]
       ],
       "shape": {
        "columns": 6,
        "rows": 10
       }
      },
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>moisture</th>\n",
       "      <th>spring_stiffness</th>\n",
       "      <th>displacement_screw_setting</th>\n",
       "      <th>motor_speed</th>\n",
       "      <th>video_id</th>\n",
       "      <th>crack_type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5</td>\n",
       "      <td>1800</td>\n",
       "      <td>0.29</td>\n",
       "      <td>60</td>\n",
       "      <td>*GH013810</td>\n",
       "      <td>longitudinal_more_than_75%</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>5</td>\n",
       "      <td>1800</td>\n",
       "      <td>0.29</td>\n",
       "      <td>60</td>\n",
       "      <td>*GH013811</td>\n",
       "      <td>longitudinal_more_than_75%</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>5</td>\n",
       "      <td>1800</td>\n",
       "      <td>0.29</td>\n",
       "      <td>60</td>\n",
       "      <td>*GH013812</td>\n",
       "      <td>longitudinal_more_than_75%</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>5</td>\n",
       "      <td>1800</td>\n",
       "      <td>0.29</td>\n",
       "      <td>60</td>\n",
       "      <td>*GH013813</td>\n",
       "      <td>longitudinal_more_than_75%</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>1800</td>\n",
       "      <td>0.29</td>\n",
       "      <td>60</td>\n",
       "      <td>*GH013814</td>\n",
       "      <td>longitudinal_more_than_75%</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>5</td>\n",
       "      <td>1800</td>\n",
       "      <td>0.29</td>\n",
       "      <td>60</td>\n",
       "      <td>*GH013815</td>\n",
       "      <td>longitudinal_more_than_75%</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>5</td>\n",
       "      <td>1800</td>\n",
       "      <td>0.29</td>\n",
       "      <td>60</td>\n",
       "      <td>*GH013812</td>\n",
       "      <td>circumferential_between_25_50%</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>5</td>\n",
       "      <td>1800</td>\n",
       "      <td>0.29</td>\n",
       "      <td>60</td>\n",
       "      <td>*GH013813</td>\n",
       "      <td>circumferential_between_25_50%</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>5</td>\n",
       "      <td>1800</td>\n",
       "      <td>0.29</td>\n",
       "      <td>60</td>\n",
       "      <td>*GH013814</td>\n",
       "      <td>circumferential_between_25_50%</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>5</td>\n",
       "      <td>1800</td>\n",
       "      <td>0.29</td>\n",
       "      <td>60</td>\n",
       "      <td>*GH013810</td>\n",
       "      <td>circumferential_more_than_75%</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   moisture  spring_stiffness  displacement_screw_setting  motor_speed  \\\n",
       "0         5              1800                        0.29           60   \n",
       "1         5              1800                        0.29           60   \n",
       "2         5              1800                        0.29           60   \n",
       "3         5              1800                        0.29           60   \n",
       "4         5              1800                        0.29           60   \n",
       "5         5              1800                        0.29           60   \n",
       "6         5              1800                        0.29           60   \n",
       "7         5              1800                        0.29           60   \n",
       "8         5              1800                        0.29           60   \n",
       "9         5              1800                        0.29           60   \n",
       "\n",
       "    video_id                      crack_type  \n",
       "0  *GH013810      longitudinal_more_than_75%  \n",
       "1  *GH013811      longitudinal_more_than_75%  \n",
       "2  *GH013812      longitudinal_more_than_75%  \n",
       "3  *GH013813      longitudinal_more_than_75%  \n",
       "4  *GH013814      longitudinal_more_than_75%  \n",
       "5  *GH013815      longitudinal_more_than_75%  \n",
       "6  *GH013812  circumferential_between_25_50%  \n",
       "7  *GH013813  circumferential_between_25_50%  \n",
       "8  *GH013814  circumferential_between_25_50%  \n",
       "9  *GH013810   circumferential_more_than_75%  "
      ]
     },
     "execution_count": 133,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Create long-format records: one row per video ID with its corresponding factor values and crack type.\n",
    "records = []\n",
    "for idx, row in df.iterrows():\n",
    "    # Extract factor values for the run\n",
    "    factors = {col: row[col] for col in factor_cols}\n",
    "    # Iterate through each crack outcome column and extract video IDs\n",
    "    for col in crack_cols:\n",
    "        video_ids = extract_video_ids(row[col])\n",
    "        for vid in video_ids:\n",
    "            record = factors.copy()\n",
    "            record[\"video_id\"] = vid\n",
    "            record[\"crack_type\"] = col  # Original crack category name\n",
    "            records.append(record)\n",
    "\n",
    "df_long = pd.DataFrame(records)\n",
    "print(\"Long-format DataFrame shape:\", df_long.shape)\n",
    "df_long.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Raw aggregated columns: ['video_id', 'moisture', 'spring_stiffness', 'displacement_screw_setting', 'motor_speed', 'circumferential_between_25_50%', 'circumferential_between_50_75%', 'circumferential_less_than_25%', 'circumferential_more_than_75%', 'crushed', 'longitudinal_between_25_50%', 'longitudinal_between_50_75%', 'longitudinal_less_than_25%', 'longitudinal_more_than_75%', 'open_crack', 'untouched']\n"
     ]
    }
   ],
   "source": [
    "# Pivot the long-format data with 'video_id' and factors as the index, and crack_type as columns.\n",
    "df_aggregated = df_long.pivot_table(index=[\"video_id\", \"moisture\", \"spring_stiffness\", \"displacement_screw_setting\", \"motor_speed\"], columns=\"crack_type\", values=\"crack_type\", aggfunc=lambda x: 1, fill_value=0).reset_index()\n",
    "\n",
    "# Print raw aggregated column names\n",
    "print(\"Raw aggregated columns:\", df_aggregated.columns.tolist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "After reassigning factor names:\n",
      "['video', 'moisture', 'spring', 'displacement', 'motor', 'circumferential_between_25_50%', 'circumferential_between_50_75%', 'circumferential_less_than_25%', 'circumferential_more_than_75%', 'crushed', 'longitudinal_between_25_50%', 'longitudinal_between_50_75%', 'longitudinal_less_than_25%', 'longitudinal_more_than_75%', 'open_crack', 'untouched']\n",
      "Raw crack outcome columns: Index(['circumferential_between_25_50%', 'circumferential_between_50_75%',\n",
      "       'circumferential_less_than_25%', 'circumferential_more_than_75%',\n",
      "       'crushed', 'longitudinal_between_25_50%', 'longitudinal_between_50_75%',\n",
      "       'longitudinal_less_than_25%', 'longitudinal_more_than_75%',\n",
      "       'open_crack', 'untouched'],\n",
      "      dtype='object')\n",
      "Columns after renaming crack outcomes:\n",
      "['video', 'moisture', 'spring', 'displacement', 'motor', 'C_2', 'C_3', 'C_1', 'C_4', 'X', 'L_2', 'L_3', 'L_1', 'L_4', 'O', 'U']\n"
     ]
    }
   ],
   "source": [
    "# Expected factor names:\n",
    "expected_factor_cols = [\"video\", \"moisture\", \"spring\", \"displacement\", \"motor\"]\n",
    "\n",
    "# Get current aggregated columns as a list\n",
    "current_cols = df_aggregated.columns.tolist()\n",
    "\n",
    "# Replace the first five columns with our desired factor names.\n",
    "for i in range(len(expected_factor_cols)):\n",
    "    current_cols[i] = expected_factor_cols[i]\n",
    "df_aggregated.columns = current_cols\n",
    "print(\"After reassigning factor names:\")\n",
    "print(df_aggregated.columns.tolist())\n",
    "\n",
    "# Print remaining crack outcome columns (raw)\n",
    "print(\"Raw crack outcome columns:\", df_aggregated.columns[5:])\n",
    "\n",
    "# Define renaming dictionary for crack outcomes:\n",
    "rename_dict = {\"untouched\": \"U\", \"longitudinal_less_than_25%\": \"L_1\", \"longitudinal_between_25_50%\": \"L_2\", \"longitudinal_between_50_75%\": \"L_3\", \"longitudinal_more_than_75%\": \"L_4\", \"circumferential_less_than_25%\": \"C_1\", \"circumferential_between_25_50%\": \"C_2\", \"circumferential_between_50_75%\": \"C_3\", \"circumferential_more_than_75%\": \"C_4\", \"open_crack\": \"O\", \"crushed\": \"X\"}\n",
    "\n",
    "# Rename crack outcome columns (columns 6 onward)\n",
    "new_crack_cols = [rename_dict.get(col, col) for col in df_aggregated.columns[5:]]\n",
    "# Combine factor columns with renamed crack outcome columns\n",
    "df_aggregated.columns = expected_factor_cols + new_crack_cols\n",
    "print(\"Columns after renaming crack outcomes:\")\n",
    "print(df_aggregated.columns.tolist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 136,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Final aggregated binary column order:\n",
      "['video', 'moisture', 'spring', 'displacement', 'motor', 'U', 'L_1', 'L_2', 'L_3', 'L_4', 'C_1', 'C_2', 'C_3', 'C_4', 'O', 'X']\n"
     ]
    }
   ],
   "source": [
    "# Define desired order for crack outcome columns\n",
    "desired_order = [\"U\", \"L_1\", \"L_2\", \"L_3\", \"L_4\", \"C_1\", \"C_2\", \"C_3\", \"C_4\", \"O\", \"X\"]\n",
    "# Factor columns remain as defined\n",
    "factor_order = expected_factor_cols\n",
    "# Extract current crack outcome columns (from index 5 onward)\n",
    "current_crack_cols = df_aggregated.columns.tolist()[5:]\n",
    "# Reorder crack outcome columns based on desired order (only include those present)\n",
    "new_crack_cols_ordered = [col for col in desired_order if col in current_crack_cols]\n",
    "# Combine factor columns with the newly ordered crack outcome columns\n",
    "new_column_order = factor_order + new_crack_cols_ordered\n",
    "df_aggregated = df_aggregated[new_column_order]\n",
    "print(\"Final aggregated binary column order:\")\n",
    "print(df_aggregated.columns.tolist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 137,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Final renamed aggregated binary dataset saved as 'meyer_aggregated_binary_renamed.csv'.\n"
     ]
    }
   ],
   "source": [
    "# Save the final aggregated binary dataset with the desired column names to a single CSV file.\n",
    "df_aggregated.to_csv(\"meyer_aggregated_binary_renamed.csv\", index=False)\n",
    "print(\"Final renamed aggregated binary dataset saved as 'meyer_aggregated_binary_renamed.csv'.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "pecan",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}