import pandas as pd
import numpy as np
# Inclusive range used by the outlier filter (replace with the range that is
# valid for your data). The infinite defaults keep every non-missing value.
LOWER_BOUND = float("-inf")
UPPER_BOUND = float("inf")


def clean_dataframe(raw_data, lower_bound=LOWER_BOUND, upper_bound=UPPER_BOUND,
                    *, fill_numerical_with_mean=False):
    """Return a cleaned copy of *raw_data*; the input frame is not modified.

    The frame must contain the columns 'numerical_column', 'text_column',
    'date_column', 'numeric_column', 'feature1', 'feature2' and
    'categorical_column'; a missing column raises ``KeyError``.

    Args:
        raw_data: DataFrame to clean.
        lower_bound: Smallest 'numerical_column' value to keep (inclusive).
        upper_bound: Largest 'numerical_column' value to keep (inclusive).
        fill_numerical_with_mean: When True, missing 'numerical_column'
            values are replaced with the column mean before rows that still
            contain missing values are dropped. When False (the default),
            every row containing a missing value is dropped.

    Returns:
        A new DataFrame with the cleaning steps below applied in order.
    """
    # Task 1: Handling Missing Values
    if fill_numerical_with_mean:
        # Option 2: fill the numerical column with its mean first. Assigning
        # the result back (rather than inplace=True on a column selection)
        # avoids chained assignment, which does not update the frame under
        # pandas copy-on-write.
        cleaned = raw_data.copy()
        column_mean = cleaned['numerical_column'].mean()
        cleaned['numerical_column'] = cleaned['numerical_column'].fillna(column_mean)
        cleaned = cleaned.dropna()
    else:
        # Option 1: remove rows with missing values.
        cleaned = raw_data.dropna()

    # Task 2: Removing Duplicates
    cleaned = cleaned.drop_duplicates()

    # Task 3: Handling Outliers
    # Keep rows whose value lies in [lower_bound, upper_bound]. The explicit
    # copy makes the column assignments below operate on an independent frame.
    in_range = cleaned['numerical_column'].between(lower_bound, upper_bound)
    cleaned = cleaned[in_range].copy()

    # Task 4: Standardizing Text Data (lowercase)
    text = cleaned['text_column'].str.lower()
    # Task 5: Removing Special Characters and Punctuation
    # Every non-alphanumeric character (including whitespace) becomes a space.
    cleaned['text_column'] = text.str.replace(r'[^a-zA-Z0-9]', ' ', regex=True)

    # Task 6: Handling Date and Time Data
    cleaned['date_column'] = pd.to_datetime(cleaned['date_column'])

    # Task 7: Data Type Conversion
    numeric = cleaned['numeric_column'].astype(float)

    # Task 8: Scaling or Normalizing Numerical Data (Min-Max Scaling)
    minimum = numeric.min()
    span = numeric.max() - minimum
    if span == 0:
        # A constant column would divide 0 by 0 and turn into NaN; map it to 0.
        cleaned['numeric_column'] = 0.0
    else:
        cleaned['numeric_column'] = (numeric - minimum) / span

    # Task 9: Feature Engineering (Creating new features)
    cleaned['new_feature'] = cleaned['feature1'] + cleaned['feature2']

    # Task 10: Handling Categorical Data (One-Hot Encoding)
    return pd.get_dummies(cleaned, columns=['categorical_column'])


if __name__ == "__main__":
    # Load the raw data (replace 'raw_data.csv' with your file)
    raw_data = pd.read_csv('raw_data.csv')
    clean_data = clean_dataframe(raw_data)
    # Save the cleaned data to a new file (replace 'cleaned_data.csv' with your desired file name)
    clean_data.to_csv('cleaned_data.csv', index=False)
# Leave a comment  (stray web-page text; not part of the script)