Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion case-study/3_Model.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"id": "4a4bd179-fa92-49c0-ae9c-ab1122bf01c5",
"metadata": {},
"outputs": [],
"source": "import pandas as pd\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.decomposition import PCA\nfrom sklearn.cluster import KMeans\nfrom sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score\nimport matplotlib.pyplot as plt\nimport numpy as np\n\nDATA = '../data/case-study/processed'\n\n# load and merge\nsession_dfs = []\nfor s in [1, 2, 3]:\n hr = pd.read_csv(f'{DATA}/hr_{s:02d}.csv')\n ibi = pd.read_csv(f'{DATA}/ibi_{s:02d}.csv')\n sed = pd.read_csv(f'{DATA}/sed_{s:02d}.csv')\n\n hr_clean = hr[hr['confidence'] == 1.0]\n ibi_clean = ibi[ibi['ibi'] > 0]\n\n hr_agg = hr_clean.groupby('datetime')['heart_rate'].mean().reset_index()\n ibi_agg = ibi_clean.groupby('datetime')['ibi'].mean().reset_index()\n\n merged = pd.merge(hr_agg, ibi_agg, on='datetime')\n # eye-tracking ('sed') is ~60 Hz vs ~1 Hz for HR/IBI, so the streams rarely share an\n # exact timestamp; align on nearest time within tolerance (an exact inner join drops ~93% of rows)\n _fmt = '%Y/%m/%d %H:%M:%S.%f'\n merged['datetime'] = pd.to_datetime(merged['datetime'], format=_fmt)\n sed['datetime'] = pd.to_datetime(sed['datetime'], format=_fmt)\n merged = merged.sort_values('datetime')\n sed = sed.sort_values('datetime')\n merged = pd.merge_asof(merged, sed, on='datetime', direction='nearest', tolerance=pd.Timedelta('100ms'))\n session_dfs.append(merged)\n\nall_data_combined = pd.concat(session_dfs, ignore_index=True)\n\nprint(\"Columns available in DataFrame:\", all_data_combined.columns)\n\n# define features\nfeatures = [\n 'heart_rate', 'ibi', 'headPos.x', 'headPos.y', 'headPos.z',\n 'gazeDir.x', 'gazeDir.y', 'gazeDir.z', 'pupil'\n]\n\nexisting_features = [f for f in features if f in all_data_combined.columns]\n\nmissing_features = set(features) - set(existing_features)\nif missing_features:\n print(f\"Missing columns in the DataFrame: {missing_features}\")\n\nall_data_combined = all_data_combined.dropna(subset=existing_features)\n\n# standardize 
features\nscaler = StandardScaler()\nX_scaled = scaler.fit_transform(all_data_combined[existing_features])\n\n# pca for visualization\npca = PCA(n_components=2)\nX_pca = pca.fit_transform(X_scaled)\nprint(f\"PCA explained variance: {pca.explained_variance_ratio_.sum():.1%}\")\n\n# find optimal k\nsilhouette_scores = []\nK = range(2, 11)\nfitted_models = {}\nfor k in K:\n km = KMeans(n_clusters=k, n_init=10, random_state=42)\n km.fit(X_scaled)\n fitted_models[k] = km\n silhouette_scores.append(silhouette_score(X_scaled, km.labels_))\n\nplt.figure(figsize=(10, 6))\nplt.plot(K, silhouette_scores, marker='o')\nplt.xlabel('Number of clusters')\nplt.ylabel('Silhouette Score')\nplt.title('Silhouette Score vs. Number of Clusters')\nplt.show()\nplt.close()\n\n# apply optimal k\noptimal_k = list(K)[np.argmax(silhouette_scores)]\nprint(f\"Optimal k selected: {optimal_k} (silhouette score: {max(silhouette_scores):.4f})\")\n\nkmeans = fitted_models[optimal_k]\nclusters = kmeans.labels_\n\n# pca cluster plot\nplt.figure(figsize=(10, 6))\nplt.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters, cmap='tab10', marker='o')\nplt.xlabel('PCA Component 1')\nplt.ylabel('PCA Component 2')\nplt.title(f'K-Means Clusters (k={optimal_k}) in PCA-Reduced Space')\nplt.colorbar(label='Cluster')\nplt.show()\nplt.close()\n\n# quality metrics\ncluster_centers = scaler.inverse_transform(kmeans.cluster_centers_)\ncluster_sizes = pd.Series(clusters).value_counts()\n\ncluster_df = pd.DataFrame(cluster_centers, columns=existing_features).round(3)\nprint(\"Cluster Centers:\")\nprint(cluster_df.to_string())\nprint(\"\\nCluster Sizes:\\n\", cluster_sizes)\n\ndbi = davies_bouldin_score(X_scaled, clusters)\nchi = calinski_harabasz_score(X_scaled, clusters)\n\nprint(f\"Davies-Bouldin Index: {dbi}\")\nprint(f\"Calinski-Harabasz Index: {chi}\")"
"source": "import pandas as pd\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.decomposition import PCA\nfrom sklearn.cluster import KMeans\nfrom sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score\nimport matplotlib.pyplot as plt\nimport numpy as np\n\nDATA = '../data/case-study/processed'\n\n# load and merge\nsession_dfs = []\nfor s in [1, 2, 3]:\n hr = pd.read_csv(f'{DATA}/hr_{s:02d}.csv')\n ibi = pd.read_csv(f'{DATA}/ibi_{s:02d}.csv')\n sed = pd.read_csv(f'{DATA}/sed_{s:02d}.csv')\n\n hr_clean = hr[hr['confidence'] == 1.0]\n ibi_clean = ibi[ibi['ibi'] > 0]\n\n hr_agg = hr_clean.groupby('datetime')['heart_rate'].mean().reset_index()\n ibi_agg = ibi_clean.groupby('datetime')['ibi'].mean().reset_index()\n\n merged = pd.merge(hr_agg, ibi_agg, on='datetime')\n # eye-tracking ('sed') is ~60 Hz vs ~1 Hz for HR/IBI, so the streams rarely share an\n # exact timestamp; align on nearest time within tolerance (an exact inner join drops ~93% of rows)\n _fmt = '%Y/%m/%d %H:%M:%S.%f'\n merged['datetime'] = pd.to_datetime(merged['datetime'], format=_fmt)\n sed['datetime'] = pd.to_datetime(sed['datetime'], format=_fmt)\n merged = merged.sort_values('datetime')\n sed = sed.sort_values('datetime')\n merged = pd.merge_asof(merged, sed, on='datetime', direction='nearest', tolerance=pd.Timedelta('100ms'))\n session_dfs.append(merged)\n\nall_data_combined = pd.concat(session_dfs, ignore_index=True)\n\nprint(\"Columns available in DataFrame:\", all_data_combined.columns)\n\n# define features\nfeatures = [\n 'heart_rate', 'ibi', 'headPos.x', 'headPos.y', 'headPos.z',\n 'gazeDir.x', 'gazeDir.y', 'gazeDir.z', 'pupil'\n]\n\nexisting_features = [f for f in features if f in all_data_combined.columns]\n\n# drop constant (zero-variance) columns - they carry no clustering signal (e.g. 
headPos.* are all-zero)\nconstant_features = [f for f in existing_features if all_data_combined[f].nunique(dropna=True) <= 1]\nif constant_features:\n print(f'Dropping constant columns: {constant_features}')\n existing_features = [f for f in existing_features if f not in constant_features]\n\n# report only columns truly absent from the data; constant columns dropped above are present, not missing\nmissing_features = set(features) - set(all_data_combined.columns)\nif missing_features:\n print(f\"Missing columns in the DataFrame: {missing_features}\")\n\nall_data_combined = all_data_combined.dropna(subset=existing_features)\n\n# standardize features\nscaler = StandardScaler()\nX_scaled = scaler.fit_transform(all_data_combined[existing_features])\n\n# pca for visualization\npca = PCA(n_components=2)\nX_pca = pca.fit_transform(X_scaled)\nprint(f\"PCA explained variance: {pca.explained_variance_ratio_.sum():.1%}\")\n\n# find optimal k\nsilhouette_scores = []\nK = range(2, 11)\nfitted_models = {}\nfor k in K:\n km = KMeans(n_clusters=k, n_init=10, random_state=42)\n km.fit(X_scaled)\n fitted_models[k] = km\n silhouette_scores.append(silhouette_score(X_scaled, km.labels_))\n\nplt.figure(figsize=(10, 6))\nplt.plot(K, silhouette_scores, marker='o')\nplt.xlabel('Number of clusters')\nplt.ylabel('Silhouette Score')\nplt.title('Silhouette Score vs. 
Number of Clusters')\nplt.show()\nplt.close()\n\n# apply optimal k\noptimal_k = list(K)[np.argmax(silhouette_scores)]\nprint(f\"Optimal k selected: {optimal_k} (silhouette score: {max(silhouette_scores):.4f})\")\n\nkmeans = fitted_models[optimal_k]\nclusters = kmeans.labels_\n\n# pca cluster plot\nplt.figure(figsize=(10, 6))\nplt.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters, cmap='tab10', marker='o')\nplt.xlabel('PCA Component 1')\nplt.ylabel('PCA Component 2')\nplt.title(f'K-Means Clusters (k={optimal_k}) in PCA-Reduced Space')\nplt.colorbar(label='Cluster')\nplt.show()\nplt.close()\n\n# quality metrics\ncluster_centers = scaler.inverse_transform(kmeans.cluster_centers_)\ncluster_sizes = pd.Series(clusters).value_counts()\n\ncluster_df = pd.DataFrame(cluster_centers, columns=existing_features).round(3)\nprint(\"Cluster Centers:\")\nprint(cluster_df.to_string())\nprint(\"\\nCluster Sizes:\\n\", cluster_sizes)\n\ndbi = davies_bouldin_score(X_scaled, clusters)\nchi = calinski_harabasz_score(X_scaled, clusters)\n\nprint(f\"Davies-Bouldin Index: {dbi}\")\nprint(f\"Calinski-Harabasz Index: {chi}\")"
}
],
"metadata": {
Expand Down
Loading