top of page
Writer's pictureAbhinandan Borse

Python: general-purpose code for cleaning raw data

import pandas as pd

import numpy as np


# General-purpose pandas cleaning template.
# Column names such as 'numerical_column', 'text_column', 'date_column',
# 'numeric_column', 'feature1', 'feature2' and 'categorical_column' are
# placeholders: replace them with the columns of your own data set.

# Load the raw data (replace 'raw_data.csv' with your file)
raw_data = pd.read_csv('raw_data.csv')


# Task 1: Handling Missing Values
# Options 1 and 2 are alternatives; keep whichever suits your data.

# Option 1: Remove rows with missing values
clean_data = raw_data.dropna()

# Option 2: Fill missing values with a specific value (e.g., mean, median)
# Example: Fill missing numerical values with mean
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected column: the chained in-place form
# warns in recent pandas and does not update the frame under Copy-on-Write.
# NOTE: Option 1 above has already dropped every row that contains a missing
# value, so this fill changes nothing unless the dropna() line is removed.
column_mean = clean_data['numerical_column'].mean()
clean_data['numerical_column'] = clean_data['numerical_column'].fillna(column_mean)


# Task 2: Removing Duplicates
clean_data = clean_data.drop_duplicates()


# Task 3: Handling Outliers
# Example: Remove rows with values outside a specific range
# The bounds default to the common 1.5 * IQR rule so the script runs as-is;
# replace them with fixed limits if your domain defines a valid range.
first_quartile = clean_data['numerical_column'].quantile(0.25)
third_quartile = clean_data['numerical_column'].quantile(0.75)
interquartile_range = third_quartile - first_quartile
lower_bound = first_quartile - 1.5 * interquartile_range
upper_bound = third_quartile + 1.5 * interquartile_range

# between() is inclusive on both ends, i.e. lower_bound <= value <= upper_bound.
in_range = clean_data['numerical_column'].between(lower_bound, upper_bound)
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the previous frame (avoids SettingWithCopyWarning).
clean_data = clean_data[in_range].copy()


# Task 4: Standardizing Text Data
# Example: Convert text to lowercase
clean_data['text_column'] = clean_data['text_column'].str.lower()


# Task 5: Removing Special Characters and Punctuation
# Example: Replace every non-alphanumeric character with a space
clean_data['text_column'] = clean_data['text_column'].str.replace(
    r'[^a-zA-Z0-9]', ' ', regex=True
)


# Task 6: Handling Date and Time Data
# Example: Convert string to datetime
clean_data['date_column'] = pd.to_datetime(clean_data['date_column'])


# Task 7: Data Type Conversion
# Example: Convert a column to a different data type
clean_data['numeric_column'] = clean_data['numeric_column'].astype(float)


# Task 8: Scaling or Normalizing Numerical Data
# Example: Min-Max Scaling to the [0, 1] interval
column_min = clean_data['numeric_column'].min()
column_range = clean_data['numeric_column'].max() - column_min
# A constant column has a range of 0; dividing by it would turn every value
# into NaN (0 / 0). Dividing by 1 instead maps such a column to 0.0.
scale = column_range if column_range != 0 else 1.0
clean_data['numeric_column'] = (clean_data['numeric_column'] - column_min) / scale


# Task 9: Feature Engineering (Creating new features)
# Example: Calculate a new feature based on existing columns
clean_data['new_feature'] = clean_data['feature1'] + clean_data['feature2']


# Task 10: Handling Categorical Data (e.g., One-Hot Encoding)
# Example: Convert categorical column to one-hot encoding
clean_data = pd.get_dummies(clean_data, columns=['categorical_column'])


# Save the cleaned data to a new file (replace 'cleaned_data.csv' with your desired file name)
clean_data.to_csv('cleaned_data.csv', index=False)


0 views · 0 comments

Recent Posts

See All

PRACTICE NUMPY

https://www.w3schools.com/python/numpy_exercises.asp https://www.w3resource.com/python-numpy-random https://www.w3resource.com/python-exe...

PRACTICE PANDAS

w3resource Website: https://www.w3resource.com W3Schools Website: https://www.w3schools.com GeeksforGeeks Website:...

PRACTICE PYTHON

Practice Python - https://www.practicepython.org PYnative Python Exercises and Challenges with Solutions - https://pynative.com/python-ex...

留言


bottom of page