# Colab notebook: https://colab.research.google.com/drive/19z81eKDr9AUWWke38V5RilcdRecRINeo?usp=sharing
import pandas as pd
import numpy as np
import os
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
# Directory where your CSV files are located
directory = 'drive/My Drive/StO2_mat(size513_911)/'

# One flattened 1D array per CSV file, plus the matching file name.
# The two lists are appended to together so index i refers to the same file.
data_arrays = []
file_names = []

# Loop through all CSV files in the directory.
# os.listdir() returns entries in arbitrary order; sorting keeps the row
# order (and therefore the seeded train/test split below) reproducible.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith('.csv'):
        continue
    file_path = os.path.join(directory, filename)
    # NOTE(review): read_csv treats the first row as a header by default.
    # If these files are raw matrices without a header row, pass
    # header=None so the first row is not dropped -- confirm against the data.
    df = pd.read_csv(file_path)
    # Flatten the 2D table row by row into a single feature vector.
    data_array = df.values.ravel()
    data_arrays.append(data_array)
    file_names.append(filename)

# Create a DataFrame from the list of 1D NumPy arrays (one row per file).
# Arrays of different lengths are padded with NaN by pandas.
data = pd.DataFrame(data_arrays)

# Add a "target column" containing the original file names
data['target_column'] = file_names
# Check if there are enough unique samples for splitting
if data['target_column'].nunique() <= 1:
    print("Not enough unique samples for train-test split.")
else:
    # NOTE(review): the labels are the raw file names, which are unique
    # within one directory, so every row is its own class and no test label
    # appears in the training set. Derive a real class label (e.g. from a
    # prefix of the file name) before trusting the reported accuracy.

    # Separate non-numeric and numeric data columns.
    # 'target_column' holds plain Python strings, which pandas typically
    # stores as object dtype. select_dtypes('string') only matches the
    # dedicated pandas string dtype, so it returned no columns and
    # 'target_column' was lost before the split (the reported KeyError).
    # Excluding the numeric dtypes keeps it.
    non_numeric_data = data.select_dtypes(exclude=['number'])
    numeric_data = data.select_dtypes(include=['number'])

    # Impute missing values in numeric data
    imputer = SimpleImputer(strategy='mean')
    numeric_data_imputed = imputer.fit_transform(numeric_data)
    # Reuse the original row index so the concat below aligns row-for-row.
    numeric_data_imputed_df = pd.DataFrame(
        numeric_data_imputed, index=numeric_data.index
    )

    # Combine non-numeric and imputed numeric data
    imputed_data = pd.concat(
        [non_numeric_data, numeric_data_imputed_df], axis=1
    )

    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        imputed_data.drop('target_column', axis=1),
        imputed_data['target_column'],
        test_size=0.2,
        random_state=42,
    )

    # Train the model on the training data
    clf = SVC(kernel='linear')
    clf.fit(X_train, y_train)

    # Make predictions on the test data
    y_pred = clf.predict(X_test)

    # Evaluate the model performance (fraction of exact label matches)
    accuracy = np.mean(y_pred == y_test)
    print('Accuracy:', accuracy)
The error I am getting is the following:
KeyError Traceback (most recent call last)
in ()
45
46 # Split the data into training and test sets
---> 47 X_train, X_test, y_train, y_test = train_test_split(imputed_data.drop('target_column', axis=1), imputed_data['target_column'], test_size=0.2, random_state=42)
48
49 # Train the model on the training data
5 frames
/usr/local/lib/python3.10/dist-packages/pandas/core/indexes/base.py in drop(self, labels, errors)
6932 if mask.any():
6933 if errors != "ignore":
-> 6934 raise KeyError(f"{list(labels[mask])} not found in axis")
6935 indexer = indexer[~mask]
6936 return self.delete(indexer)
KeyError: "['target_column'] not found in axis"
Top comments (0)