pip install pandas
import pandas as pd
import numpy as np

# Create a numpy array
data = np.array([[1, 2], [8, 3], [4, 5]])  # 3 rows, 2 columns
# 1, 2
# 8, 3
# 4, 5

# Create a dataframe
df = pd.DataFrame(data)
# OR
df = pd.DataFrame(data, index=['row1', 'row2', 'row3'], columns=['col1', 'col2'])
# OR
df = pd.DataFrame(data, columns=['A', 'B'])

# Print the dataframe
print(df)
# Create an array (list of lists)
data = [[1, 2], [8, 3], [4, 5]]  # 3 rows, 2 columns
# 1, 2
# 8, 3
# 4, 5

# Create a dataframe
df = pd.DataFrame(data)
# OR
df = pd.DataFrame(data, index=['row1', 'row2', 'row3'], columns=['col1', 'col2'])
# OR
df = pd.DataFrame(data, columns=['A', 'B'])

# Print the dataframe
print(df)
import pandas as pd

# Lists used to create a dictionary
states = ['California', 'Texas', 'Florida', 'New York']
population = [38.5, 20.6, 19.8, 40.7]

# Storing lists in a dictionary
dict_population = {'States': states, 'population': population}

# Create a dataframe
df = pd.DataFrame(dict_population)

# Print the dataframe
print(df)
# Read the CSV file
import pandas as pd
df = pd.read_csv('data.csv')

# Show the first 5 rows of the dataframe
print(df.head())  # 5 rows
# Read the CSV file
import pandas as pd
df = pd.read_csv('data.csv')

# Show the first 5 rows of the DataFrame
print(df.head())  # 5 rows

# Show the last 5 rows of the DataFrame
print(df.tail())  # 5 rows

# Print all the rows
print(df)  # All rows

# Print the first n rows of the DataFrame
n = 10
print(df.head(n))  # n rows

# Print the last n rows of the DataFrame
n = 10
print(df.tail(n))  # n rows
# Display all the rows of the DataFrame in Jupyter Notebook

# Getting access to the shape attribute of the DataFrame
df.shape  # (number of rows, number of columns) Example: (18207, 89)

# Display n rows
rows = 1000
pd.set_option('display.max_rows', rows)

# Display all rows
df  # shows all rows in Jupyter Notebook
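To go back to the default row limit afterwards, the same option can be reset (a small sketch):

# Restore the default value of 'display.max_rows'
pd.reset_option('display.max_rows')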
# Getting access to the shape attribute of the DataFrame
df.shape  # (3, 2) -> (number of rows, number of columns) Example: (18207, 89)

# Getting access to the index attribute of the DataFrame
df.index  # RangeIndex(start=0, stop=3, step=1)

# Getting access to the columns attribute of the DataFrame
df.columns  # ['A', 'B']

# Getting access to the data types of each column
df.dtypes  # [object, object, int64] --> object is usually a string
# Getting the info of the DataFrame
df.info()  # information about the DataFrame: how many rows are not null, data types, number of rows and columns, etc.

# Basic statistics of the DataFrame
df.describe()  # min, max, mean, std, etc.
# Getting the length of the DataFrame - number of rows
len(df)  # 3

# Getting the highest index of the DataFrame
max(df.index)  # maximum value of the index
# OR
df.index.max()  # maximum value of the index

# Getting the lowest index of the DataFrame
min(df.index)  # 0
# OR
df.index.min()  # 0

# Round the values of the DataFrame
round(df, 2)  # round to 2 decimal places
# OR
df.round(2)  # round to 2 decimal places
df['A'] # Selecting the column A - returns a Series
# Check out the data type of a column
type(df['Gender'])  # pandas.core.series.Series
# OR
df['A'].dtypes  # object
# Series attributes and methods
df['Gender'].index  # RangeIndex(start=0, stop=1000, step=1)

# Get the first 5 rows of the Series
df['Gender'].head()
df.Gender  # Gender is the name of the column
# This option cannot be used when the column name contains a space
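For column names that do contain a space (such as 'math score' used later), bracket notation must be used instead (a small sketch):

df['math score']  # works
# df.math score   # SyntaxError - attribute access cannot contain a space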
# Selecting two columns using [[]]
df[['A', 'B']]  # Selecting the columns A and B - returns a DataFrame

df[['gender', 'math score', 'writing score']]  # Selecting multiple columns
# Add a new column to the DataFrame
df['C'] = df['A'] + df['B']  # Add column C to the DataFrame
# Adding a new column to the DataFrame with an array
import numpy as np
df['D'] = np.array([1, 2, 3, 4, 5])  # Add column D to the DataFrame
# OR
df['D'] = [1, 2, 3, 4, 5]  # Add column D to the DataFrame
# Adding a new column to the DataFrame with an array
import numpy as np
language_score = np.arange(0, 1000)  # Create an array from 0 to 999

# Verify the length of the array
len(language_score)  # 1000

# Add column 'language score' to the DataFrame
df['language score'] = language_score
# Create random integer numbers between 1 and 100
import numpy as np

# min value is inclusive, max value is exclusive!
language_score = np.random.randint(1, 101, size=1000)  # Create an array of 1000 random integers from 1 to 100

# Verify the length of the array
len(language_score)  # 1000
# min value is inclusive, max value is exclusive!
min(language_score)  # 1
max(language_score)  # 100
# Create random float numbers between 1 and 100
import numpy as np

# min value is inclusive, max value is exclusive!
language_score = np.random.uniform(1, 100, size=1000)  # Create an array of 1000 random floats from the half-open interval [1, 100)
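If the same random scores should come out on every run of the notebook, the NumPy random generator can be seeded first (a small sketch using the same call as above):

np.random.seed(42)  # fix the seed so the same random values are produced each run
language_score = np.random.randint(1, 101, size=1000)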
# Select a column and calculate a total sum of the column
df['A'].sum()  # sum of the column A

# Count, mean, std, min, max of the column

# Count the number of rows in the column
df['A'].count()  # count of the column A

# Calculate the mean (average) of the column
df['A'].mean()  # average of the column A

# Calculate the standard deviation of the column
df['A'].std()  # standard deviation of the column A

# Calculate the maximum of the column
df['A'].max()  # maximum of the column A

# Calculate the minimum of the column
df['A'].min()  # minimum of the column A

# Same calculation using the .describe() method
df.describe()  # summary statistics of the DataFrame
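By default describe() summarizes only numeric columns; if the DataFrame also has text columns (such as 'gender'), the include argument can cover them as well (a small sketch):

# Summary of object (string) columns: count, unique, top, freq
df.describe(include='object')
# OR summarize every column
df.describe(include='all')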
# Calculate a row-wise sum across columns
df['math score'] + df['writing score'] + df['reading score']  # sum of math score, writing score and reading score for each row
df['average score'] = (df['math score'] + df['writing score'] + df['reading score']) / 3  # average score
print(df.round(2))  # print the DataFrame, rounded to 2 decimal places
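The same average can be computed without typing out the sum, by taking a row-wise mean over the selected columns (a small sketch; axis=1 means "across columns, per row"):

# Row-wise mean of the three score columns
df['average score'] = df[['math score', 'writing score', 'reading score']].mean(axis=1)
print(df.round(2))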
# Count gender elements - len function
len(df['gender'])  # 1000
# Count gender elements by category
df['gender'].value_counts()
# female    518
# male      482
# Name: gender, dtype: int64
# Return the relative frequency (divide all values by the sum of values), rounded to 2 decimals
df['gender'].value_counts(normalize=True).round(2)
# female    0.52
# male      0.48
# Name: gender, dtype: float64
# Count 'level of education' by category
df['level_of_education'].value_counts()
# Sort the DataFrame by the column 'A'
df.sort_values(by='A')  # sort the DataFrame by the column A
# Same as
df.sort_values('A')  # sort the DataFrame by the column A
# Sort the DataFrame by the column 'A' in descending order
df.sort_values(by='A', ascending=False)
# Sort the DataFrame by the columns 'math score' and 'reading score' in descending order
df.sort_values(by=['math score', 'reading score'], ascending=False)
# Sort the DataFrame by the two score columns in descending order and update the DataFrame
df.sort_values(by=['math score', 'reading score'], ascending=False, inplace=True)
# Same as reassigning the result (without inplace=True, which returns None)
df = df.sort_values(by=['math score', 'reading score'], ascending=False)
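After sorting, the original index values stay attached to the rows; if a fresh 0..n-1 index is wanted, reset_index can be chained on (a small sketch):

# Sort and rebuild a clean integer index
df = df.sort_values(by=['math score', 'reading score'], ascending=False).reset_index(drop=True)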
# Sort by 'race' (case-insensitive) in descending order
df.sort_values('race', ascending=False, key=lambda col: col.str.lower())  # the key argument requires pandas 1.1+
# Keep only the rows whose value in MY_COLUMN_NAME is NOT in my_list (~ negates the isin mask)
df = df[~df[MY_COLUMN_NAME].isin(my_list)]
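As a concrete illustration with the student data used earlier (the exclusion list here is made up):

# Drop every row whose 'gender' appears in the exclusion list
exclude = ['male']
df = df[~df['gender'].isin(exclude)]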
import pandas as pd
pd.read_csv('data.csv')
data_frame = pd.read_csv('data.csv')
data_frame.shape  # (number of rows, number of columns) Example: (18207, 89)
data_frame.describe()
data_frame.values  # the underlying numpy array of the DataFrame
data_frame[data_frame["Age"] > 40].head()  # Select the rows where Age is greater than 40 and show the first 5
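Several conditions can be combined with & (and) or | (or); each condition needs its own parentheses (a small sketch using the same Age column):

# Players older than 40 but younger than 45
data_frame[(data_frame["Age"] > 40) & (data_frame["Age"] < 45)].head()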
pd.DataFrame(data_frame, columns=['Name', 'Wage', 'Value'])

In Jupyter Notebook:
df1 = pd.DataFrame(data_frame, columns=['Name', 'Wage', 'Value'])
df1
def value_to_float(x):
    if type(x) == float or type(x) == int:
        return x
    if 'K' in x:
        if len(x) > 1:
            return float(x.replace('K', '')) * 1000
        return 1000.0
    if 'M' in x:
        if len(x) > 1:
            return float(x.replace('M', '')) * 1000000
        return 1000000.0
    if 'B' in x:
        return float(x.replace('B', '')) * 1000000000
    return 0.0
df1 = pd.DataFrame(data_frame, columns=['Name', 'Wage', 'Value'])

# Strip the euro sign and thousands separators, then convert the K/M/B suffixes to numbers
value = df1['Value'].replace('[€,]', '', regex=True).apply(value_to_float)
wage = df1['Wage'].replace('[€,]', '', regex=True).apply(value_to_float)

df1['Wage'] = wage
df1['Value'] = value

df1['difference'] = df1['Value'] - df1['Wage']
df1.sort_values('difference', ascending=False)  # Used in Jupyter Notebook
pip install seaborn
import seaborn as sns
import seaborn as sns

sns.set()
graph = sns.scatterplot(x='Wage', y='Value', data=df1)
graph
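To keep the plot as a file outside the notebook, the underlying matplotlib figure can be saved (a small sketch; the filename is arbitrary):

# Save the scatter plot to a PNG file
graph.get_figure().savefig('wage_vs_value.png', dpi=150)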
pip install bokeh
from bokeh.plotting import figure, show
from bokeh.models import HoverTool
TOOLTIPS = HoverTool(tooltips=[
    ("index", "$index"),
    ("(Wage,Value)", "(@Wage, @Value)"),
    ("Name", "@Name")]
)

p = figure(title="Soccer 2019", x_axis_label='Wage', y_axis_label='Value',
           plot_width=700, plot_height=700, tools=[TOOLTIPS])

p.circle('Wage', 'Value', size=10, source=df1)
show(p)
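When running this in Jupyter Notebook, Bokeh needs output_notebook() once so the plot renders inline instead of opening an HTML file (a small sketch):

from bokeh.io import output_notebook

output_notebook()  # route Bokeh output to the notebook
show(p)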
scikit-learn also has built-in datasets to use for training and testing.
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data    # data (the input)
y = iris.target  # target (the answer)
# f(X) = y  -> we need to find the best parameters for the model

feature_names = iris.feature_names
target_names = iris.target_names

feature_names
target_names
train_test_split: split arrays or matrices into random train and test subsets.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# test_size=0.2 means 20% test and 80% training
print(X_train.shape)  # (120, 4) -> train data has 120 rows, 4 columns
print(X_test.shape)   # (30, 4)  -> test data has 30 rows, 4 columns
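For classification data the split above can also be stratified, so the class proportions of y are preserved in both subsets (a small sketch using the same variables):

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y)  # keep the class balance in train and test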
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3)  # n_neighbors=3: how many data points around me do I need to observe to decide which type of flower I am?
knn = knn.fit(X_train, y_train)
y_prediction = knn.predict(X_test)  # Predict the flower type for the test data
from sklearn import metrics
print(metrics.accuracy_score(y_test, y_prediction))  # Accuracy is the percentage of correct predictions
# 0.9666666666666667 -> 96% accuracy
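Accuracy alone does not show which classes get mixed up; the same metrics module can also print a confusion matrix and a per-class report (a small sketch):

# Rows = true classes, columns = predicted classes
print(metrics.confusion_matrix(y_test, y_prediction))
print(metrics.classification_report(y_test, y_prediction, target_names=iris.target_names))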
from sklearn.tree import DecisionTreeClassifier

knn = DecisionTreeClassifier()  # (the variable name is reused from the kNN example)
knn = knn.fit(X_train, y_train)
y_prediction = knn.predict(X_test)  # Predict the flower type for the test data
from sklearn import metrics
print(metrics.accuracy_score(y_test, y_prediction))  # Accuracy is the percentage of correct predictions
# 0.9666666666666667 -> 96% accuracy
sample = [[3, 5, 4, 2], [2, 3, 5, 4]]
predictions = knn.predict(sample)  # Predict the flower type for the provided measurements
pred_species = [iris.target_names[p] for p in predictions]
print("predictions: ", pred_species)
predictions: ['versicolor', 'virginica']
from joblib import dump, load

dump(knn, 'mlbrain.joblib')  # Save the trained model to a file
['mlbrain.joblib']
model = load('mlbrain.joblib')  # Load the saved model back from the file
model.predict(X_test)
array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 2, 1, 1, 1, 2, 0, 1, 1, 0, 0, 2, 1, 0, 0, 2, 0, 0, 1, 1, 0])
sample = [[3, 5, 4, 2], [2, 3, 5, 4]]
predictions = model.predict(sample)  # Predict with the loaded model
pred_species = [iris.target_names[p] for p in predictions]
print("predictions: ", pred_species)
predictions: ['versicolor', 'virginica']