Data Manipulation with Pandas
Pandas
• Pandas is a Python library used for working with data sets.
• It has functions for analyzing, cleaning, exploring, and
manipulating data.
• The name "Pandas" refers to both "Panel Data" and
"Python Data Analysis"; the library was created by Wes McKinney in
2008.
• Pandas allows us to analyze big data and make conclusions
based on statistical theories.
• Pandas can clean messy data sets, and make them readable
and relevant.
• Relevant data is very important in data science.
• pip install pandas
• import pandas
• Ex
import pandas
mydataset = {
'cars': ["BMW", "Volvo", "Ford"],
'passings': [3, 7, 2]
}
myvar = pandas.DataFrame(mydataset)
print(myvar)
• import pandas as pd
• print(pd.__version__)
• A Pandas Series is like a column in a table.
• It is a one-dimensional array holding data of any type.
• import pandas as pd
a = [1, 7, 2]
myvar = pd.Series(a)
print(myvar)
• import pandas as pd
a = [1, 7, 2]
myvar = pd.Series(a, index = ["x", "y", "z"])
print(myvar)
Pandas Series
import pandas as pd
calories = {"day1": 420, "day2": 380, "day3": 390}
myvar = pd.Series(calories)
print(myvar)
import pandas as pd
calories = {"day1": 420, "day2": 380, "day3": 390}
myvar = pd.Series(calories, index = ["day1", "day2"])
print(myvar)
import pandas as pd
data = {
"calories": [420, 380, 390],
"duration": [50, 40, 45]
}
myvar = pd.DataFrame(data)
print(myvar)
print(df.loc[0]) #locate row
print(df.loc[[0, 1]])
import pandas as pd
data = {
"calories": [420, 380, 390],
"duration": [50, 40, 45]
}
df = pd.DataFrame(data, index = ["day1", "day2", "day3"]) # named index
print(df)
print(df.loc["day2"])
DataFrames
• import pandas as pd
df = pd.read_csv('data.csv')
print(df)
• import pandas as pd
print(pd.options.display.max_rows)
• import pandas as pd
pd.options.display.max_rows = 9999
df = pd.read_csv('data.csv')
print(df)
• import pandas as pd
df = pd.read_json('data.json')
print(df.to_string())
• import pandas as pd
data = {
"Duration":{
"0":60,
"1":60,
"2":60,
"3":45,
"4":45,
"5":60
},
"Pulse":{
"0":110,
"1":117,
"2":103,
"3":109,
"4":117,
"5":102
},
"Maxpulse":{
"0":130,
"1":145,
"2":135,
"3":175,
"4":148,
"5":127
},
"Calories":{
"0":409,
"1":479,
"2":340,
"3":282,
"4":406,
"5":300
}
}
df = pd.DataFrame(data)
• print(df.head(10))
• print(df.head())
• print(df.tail())
• print(df.info())
Data Cleaning
• Data cleaning means fixing bad data in your
data set.
• Bad data could be:
• Empty cells
• Data in wrong format
• Wrong data
• Duplicates
• import pandas as pd
df = pd.read_csv('data.csv')
new_df = df.dropna()
print(new_df.to_string())
• df.dropna(inplace = True) #df.dropna(subset=['Date'], inplace = True)
import pandas as pd
df = pd.read_csv('data.csv')
df.fillna(130, inplace = True)
• import pandas as pd
df = pd.read_csv('data.csv')
df["Calories"].fillna(130, inplace = True)
• import pandas as pd
df = pd.read_csv('data.csv')
x = df["Calories"].mean()
df["Calories"].fillna(x, inplace = True)
• x = df["Calories"].median()
df["Calories"].fillna(x, inplace = True)
• import pandas as pd
df = pd.read_csv('data.csv')
x = df["Calories"].mode()[0]
df["Calories"].fillna(x, inplace = True)
• import pandas as pd
df = pd.read_csv('data.csv')
df['Date'] = pd.to_datetime(df['Date'])
print(df.to_string())
• df.loc[7, 'Duration'] = 45
• for x in df.index:
if df.loc[x, "Duration"] > 120:
df.loc[x, "Duration"] = 120
• for x in df.index:
if df.loc[x, "Duration"] > 120:
df.drop(x, inplace = True)
• print(df.duplicated())
• df.drop_duplicates(inplace = True)
Data Correlations
• df.corr()
• Perfect Correlation:
We can see that "Duration" and "Duration" got the number 1.000000,
which makes sense, each column always has a perfect relationship with
itself.
• Good Correlation:
"Duration" and "Calories" got a 0.922721 correlation, which is a very
good correlation, and we can predict that the longer you work out, the
more calories you burn
• Bad Correlation:
"Duration" and "Maxpulse" got a 0.009403 correlation, which is a very
bad correlation, meaning that we can not predict the max pulse by just
looking at the duration of the work out
Pandas - Plotting
• import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv('data.csv')
df.plot()
plt.show()
• import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv('data.csv')
df.plot(kind = 'scatter', x = 'Duration', y = 'Calories')
plt.show()
• import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv('data.csv')
df.plot(kind = 'scatter', x = 'Duration', y = 'Maxpulse')
plt.show()
• df["Duration"].plot(kind = 'hist')
Data Indexing and Selection
import pandas as pd
# making data frame from csv file
data = pd.read_csv("nba.csv", index_col ="Name")
# retrieving columns by indexing operator
first = data["Age"]
print(first)
# importing pandas package
import pandas as pd
# making data frame from csv file
data = pd.read_csv("nba.csv", index_col ="Name")
# retrieving multiple columns by indexing operator
first = data[["Age", "College", "Salary"]]
print(first )
• # importing pandas package
• import pandas as pd
• # making data frame from csv file
• data = pd.read_csv("nba.csv", index_col ="Name")
• # retrieving row by loc method
• first = data.loc["Avery Bradley"]
• second = data.loc["R.J. Hunter"]
• print(first, "\n\n\n", second)
• import pandas as pd
• # making data frame from csv file
• data = pd.read_csv("nba.csv", index_col ="Name")
• # retrieving multiple rows by loc method
• first = data.loc[["Avery Bradley", "R.J. Hunter"]]
• import pandas as pd
• # making data frame from csv file
• data = pd.read_csv("nba.csv", index_col ="Name")
• # retrieving two rows and three columns by loc method
• first = data.loc[["Avery Bradley", "R.J. Hunter"],
• ["Team", "Number", "Position"]]
• print(first)
• import pandas as pd
• # making data frame from csv file
• data = pd.read_csv("nba.csv", index_col ="Name")
• # retrieving all rows and some columns by loc method
• first = data.loc[:, ["Team", "Number", "Position"]]
• print(first)
• import pandas as pd
• # making data frame from csv file
• data = pd.read_csv("nba.csv", index_col ="Name")
• # retrieving rows by iloc method
• row2 = data.iloc[3]
• print(row2)
•
• import pandas as pd
• # making data frame from csv file
• data = pd.read_csv("nba.csv", index_col ="Name")
• # retrieving multiple rows by iloc method
• row2 = data.iloc [[3, 5, 7]]
• row2
• import pandas as pd
• # making data frame from csv file
• data = pd.read_csv("nba.csv", index_col ="Name")
• # retrieving two rows and two columns by iloc method
• row2 = data.iloc [[3, 4], [1, 2]]
• print(row2)
• import pandas as pd
• # making data frame from csv file
• data = pd.read_csv("nba.csv", index_col ="Name")
• # retrieving all rows and some columns by iloc method
• row2 = data.iloc [:, [1, 2]]
• print(row2)
• import pandas as pd
• # making data frame from csv file
• data = pd.read_csv("nba.csv", index_col ="Name")
• # retrieving row by ix method (note: .ix was removed in pandas 1.0; use .loc or .iloc instead)
• first = data.ix["Avery Bradley"] #index slicing
• print(first)
•
• import pandas as pd
• # making data frame from csv file
• data = pd.read_csv("nba.csv", index_col ="Name")
• # retrieving row by ix method
• first = data.ix[1]
• print(first)
Function Description
Dataframe.head() Return top n rows of a data frame.
Dataframe.tail() Return bottom n rows of a data frame.
Dataframe.at[] Access a single value for a row/column label pair.
Dataframe.iat[] Access a single value for a row/column pair by integer position.
Dataframe.iloc[] Purely integer-location based indexing for selection by position.
DataFrame.lookup() Label-based “fancy indexing” function for DataFrame.
DataFrame.pop() Return item and drop from frame.
DataFrame.xs() Returns a cross-section (row(s) or column(s)) from the DataFrame.
DataFrame.get() Get item from object for given key (DataFrame column, Panel slice, etc.).
DataFrame.isin() Return boolean DataFrame showing whether each element in the DataFrame
is contained in values.
DataFrame.where()
Return an object of same shape as self and whose corresponding entries are
from self where cond is True and otherwise are from other.
DataFrame.mask() Return an object of same shape as self and whose corresponding entries are
from self where cond is False and otherwise are from other.
DataFrame.query() Query the columns of a frame with a boolean expression.
DataFrame.insert() Insert column into DataFrame at specified location.
• df = pd.DataFrame([['1990', 'a', 5, 4, 7, 2], ['1991', 'c', 10, 1, 2, 0], ['1992', 'd', 2, 1, 4, 12],
['1993', 'a', 5, 8, 11, 6]], columns=('Date', 'best', 'a', 'b', 'c', 'd'))
• df['value'] = df.lookup(df.index, df['best'])
• df.at[2,'Date']
• df.iat[1,1]
• df.pop('b')
• df.get('best')
• df.xs(0) #index df.set_index('best') df.xs('a')
• df.isin([1990, 1991])
• df.isin({'Date': [1990, 1991]})
• df.isin({'best':['a','c'],'a':[2]})
• newdf = df.where(df["a"] > 3)
• newdf = df.mask(df["a"] > 3)
• print(df.query('a > 3'))
• df.insert(1, "value", [50, 40, 30,20])
Python Pandas Data operations
• A = pd.Series([2, 4, 6], index=[0, 1, 2])
• B = pd.Series([1, 3, 5], index=[1, 2, 3])
• A + B
• A.add(B, fill_value=0)
area = pd.Series({'Alaska': 1723337, 'Texas': 695662,
'California': 423967}, name='area')
population = pd.Series({'California': 38332521,
'Texas': 26448193, 'New York': 19651127},
name='population')
area.index | population.index
population / area
• A = pd.DataFrame(rng.randint(0, 20, (2, 2)),
columns=list('AB'))
• B = pd.DataFrame(rng.randint(0, 10, (3, 3)),
columns=list('BAC'))
• A + B
• fill = A.stack().mean()
• A.add(B, fill_value=fill)
Python Operator Pandas Method(s)
• + add()
• - sub(), subtract()
• * mul(), multiply()
• / truediv(), div(), divide()
• // floordiv()
• % mod()
• ** pow()
Hierarchical indexing, also known as multi-indexing, means setting more than one column as
the index.
• import pandas as pd
• df = pd.read_csv('homelessness.csv')
• print(df.head())
• col = df.columns
• print(col)
• # using the pandas set_index() function.
• df_ind3 = df.set_index(['region', 'state', 'individuals'])
• # we can sort the data by using sort_index()
• df_ind3.sort_index()
• print(df_ind3.head(10))
• df_ind3_region = df_ind3.loc[['Pacific', 'Mountain']]
• print(df_ind3_region.head(10))
Merge, Join, and Concatenate DataFrames Using Pandas
# Creating first dataframe
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']},
index=[0, 1, 2, 3])
# Creating second dataframe
df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'],
'B': ['B4', 'B5', 'B6', 'B7'],
'C': ['C4', 'C5', 'C6', 'C7'],
'D': ['D4', 'D5', 'D6', 'D7']},
index=[4, 5, 6, 7])
# Creating third dataframe
df3 = pd.DataFrame({'A': ['A8', 'A9', 'A10', 'A11'],
'B': ['B8', 'B9', 'B10', 'B11'],
'C': ['C8', 'C9', 'C10', 'C11'],
'D': ['D8', 'D9', 'D10', 'D11']},
index=[8, 9, 10, 11])
# Concatenating the dataframes
pd.concat([df1, df2, df3])
•
• # Dataframe created
• left = pd.DataFrame({'Key': ['K0', 'K1', 'K2', 'K3'],
• 'A': ['A0', 'A1', 'A2', 'A3'],
• 'B': ['B0', 'B1', 'B2', 'B3']})
•
• right = pd.DataFrame({'Key': ['K0', 'K1', 'K2', 'K3'],
• 'C': ['C0', 'C1', 'C2', 'C3'],
• 'D': ['D0', 'D1', 'D2', 'D3']})
•
• # Merging the dataframes
• pd.merge(left, right, how='inner', on='Key')
•
• left_merged = pd.merge(left, right, how='left', on='Key')
• print(left_merged)
•
• right_merged = pd.merge(left, right, how='right', on='Key')
• print(right_merged)
•
• outer_merged = pd.merge(left, right, how='outer', on='Key')
• print(outer_merged)
• left = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
• 'B': ['B0', 'B1', 'B2', 'B3']},
• index=['K0', 'K1', 'K2', 'K3'])
•
• right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'],
• 'D': ['D0', 'D1', 'D2', 'D3']},
• index=['K0', 'K1', 'K2', 'K3'])
•
• # Joining the dataframes
• left.join(right)
• import pandas as pd
•
• # Creating our dataset
• df = pd.DataFrame([[9, 4, 8, 9],
• [8, 10, 7, 6],
• [7, 6, 8, 5]],
• columns=['Maths', 'English',
• 'Science', 'History'])
•
• # display dataset
• print(df)
• df.describe()
• df.agg(['sum', 'min', 'max'])
• df.groupby(by=['Maths'])
• a = df.groupby('Maths')
• a.first()
• b = df.groupby(['Maths', 'Science'])
• b.first()
• dataset.groupby('cut').sum()
• dataset.groupby(['cut', 'color']).agg('min')
• sum() :Compute sum of column values
• min() :Compute min of column values
• max() :Compute max of column values
• mean() :Compute mean of column
• size() :Compute column sizes
• describe() :Generates descriptive statistics
• first() :Compute first of group values
• last() :Compute last of group values
• count() :Compute count of column values
• std() :Standard deviation of column
• var() :Compute variance of column
• sem() :Standard error of the mean of column
Data_Manipulation_with_Pandas that manipulation used

Data_Manipulation_with_Pandas that manipulation used

  • 1.
  • 2.
    Pandas • Pandas is a Python library used for working with data sets. • It has functions for analyzing, cleaning, exploring, and manipulating data. • The name "Pandas" refers to both "Panel Data" and "Python Data Analysis"; the library was created by Wes McKinney in 2008. • Pandas allows us to analyze big data and make conclusions based on statistical theories. • Pandas can clean messy data sets, and make them readable and relevant. • Relevant data is very important in data science.
  • 3.
    • pip install pandas • import pandas • Ex import pandas mydataset = { 'cars': ["BMW", "Volvo", "Ford"], 'passings': [3, 7, 2] } myvar = pandas.DataFrame(mydataset) print(myvar) • import pandas as pd • print(pd.__version__)
  • 4.
    • like a column in a table. • a one-dimensional array holding data of any type. • import pandas as pd a = [1, 7, 2] myvar = pd.Series(a) print(myvar) • import pandas as pd a = [1, 7, 2] myvar = pd.Series(a, index = ["x", "y", "z"]) print(myvar) Pandas Series
  • 5.
    import pandas as pd calories = {"day1": 420, "day2": 380, "day3": 390} myvar = pd.Series(calories) print(myvar) import pandas as pd calories = {"day1": 420, "day2": 380, "day3": 390} myvar = pd.Series(calories, index = ["day1", "day2"]) print(myvar)
  • 6.
    import pandas aspd data = { "calories": [420, 380, 390], "duration": [50, 40, 45] } myvar = pd.DataFrame(data) print(myvar) print(df.loc[0]) #locate row print(df.loc[[0, 1]]) import pandas as pd data = { "calories": [420, 380, 390], "duration": [50, 40, 45] } df = pd.DataFrame(data, index = ["day1", "day2", "day3"]) # named index print(df) print(df.loc["day2"]) DataFrames
  • 7.
    • import pandas as pd df = pd.read_csv('data.csv') print(df) • import pandas as pd print(pd.options.display.max_rows) • import pandas as pd pd.options.display.max_rows = 9999 df = pd.read_csv('data.csv') print(df)
  • 8.
    • import pandasas pd df = pd.read_json('data.json') print(df.to_string()) • import pandas as pd data = { "Duration":{ "0":60, "1":60, "2":60, "3":45, "4":45, "5":60 }, "Pulse":{ "0":110, "1":117, "2":103, "3":109, "4":117, "5":102 }, "Maxpulse":{ "0":130, "1":145, "2":135, "3":175, "4":148, "5":127 }, "Calories":{ "0":409, "1":479, "2":340, "3":282, "4":406, "5":300 } } df = pd.DataFrame(data)
  • 9.
    • print(df.head(10)) • print(df.head()) • print(df.tail()) • print(df.info())
  • 10.
    Data Cleaning • Data cleaning means fixing bad data in your data set. • Bad data could be: • Empty cells • Data in wrong format • Wrong data • Duplicates
  • 11.
    • import pandasas pd df = pd.read_csv('data.csv') new_df = df.dropna() print(new_df.to_string()) • df.dropna(inplace = True) #df.dropna(subset=['Date'], inplace = True) import pandas as pd df = pd.read_csv('data.csv') df.fillna(130, inplace = True) • import pandas as pd df = pd.read_csv('data.csv') df["Calories"].fillna(130, inplace = True)
  • 12.
    • import pandasas pd df = pd.read_csv('data.csv') x = df["Calories"].mean() df["Calories"].fillna(x, inplace = True) • x = df["Calories"].median() df["Calories"].fillna(x, inplace = True) • import pandas as pd df = pd.read_csv('data.csv') x = df["Calories"].mode()[0] df["Calories"].fillna(x, inplace = True)
  • 13.
    • import pandasas pd df = pd.read_csv('data.csv') df['Date'] = pd.to_datetime(df['Date']) print(df.to_string()) • df.loc[7, 'Duration'] = 45 • for x in df.index: if df.loc[x, "Duration"] > 120: df.loc[x, "Duration"] = 120 • for x in df.index: if df.loc[x, "Duration"] > 120: df.drop(x, inplace = True)
  • 14.
  • 15.
    Data Correlations • df.corr() •Perfect Correlation: We can see that "Duration" and "Duration" got the number 1.000000, which makes sense, each column always has a perfect relationship with itself. • Good Correlation: "Duration" and "Calories" got a 0.922721 correlation, which is a very good correlation, and we can predict that the longer you work out, the more calories you burn • Bad Correlation: "Duration" and "Maxpulse" got a 0.009403 correlation, which is a very bad correlation, meaning that we can not predict the max pulse by just looking at the duration of the work out
  • 16.
    Pandas - Plotting •import pandas as pd import matplotlib.pyplot as plt df = pd.read_csv('data.csv') df.plot() plt.show() • import pandas as pd import matplotlib.pyplot as plt df = pd.read_csv('data.csv') df.plot(kind = 'scatter', x = 'Duration', y = 'Calories') plt.show() • import pandas as pd import matplotlib.pyplot as plt df = pd.read_csv('data.csv') df.plot(kind = 'scatter', x = 'Duration', y = 'Maxpulse') plt.show() • df["Duration"].plot(kind = 'hist')
  • 17.
    Data Indexing andSelection import pandas as pd # making data frame from csv file data = pd.read_csv("nba.csv", index_col ="Name") # retrieving columns by indexing operator first = data["Age"] print(first) # importing pandas package import pandas as pd # making data frame from csv file data = pd.read_csv("nba.csv", index_col ="Name") # retrieving multiple columns by indexing operator first = data[["Age", "College", "Salary"]] print(first )
  • 18.
    • # importingpandas package • import pandas as pd • # making data frame from csv file • data = pd.read_csv("nba.csv", index_col ="Name") • # retrieving row by loc method • first = data.loc["Avery Bradley"] • second = data.loc["R.J. Hunter"] • print(first, "nnn", second) • import pandas as pd • # making data frame from csv file • data = pd.read_csv("nba.csv", index_col ="Name") • # retrieving multiple rows by loc method • first = data.loc[["Avery Bradley", "R.J. Hunter"]]
  • 19.
    • import pandasas pd • # making data frame from csv file • data = pd.read_csv("nba.csv", index_col ="Name") • # retrieving two rows and three columns by loc method • first = data.loc[["Avery Bradley", "R.J. Hunter"], • ["Team", "Number", "Position"]] • print(first) • import pandas as pd • # making data frame from csv file • data = pd.read_csv("nba.csv", index_col ="Name") • # retrieving all rows and some columns by loc method • first = data.loc[:, ["Team", "Number", "Position"]] • print(first)
  • 20.
    • import pandasas pd • # making data frame from csv file • data = pd.read_csv("nba.csv", index_col ="Name") • # retrieving rows by iloc method • row2 = data.iloc[3] • print(row2) • • import pandas as pd • # making data frame from csv file • data = pd.read_csv("nba.csv", index_col ="Name") • # retrieving multiple rows by iloc method • row2 = data.iloc [[3, 5, 7]] • row2
  • 21.
    • import pandasas pd • # making data frame from csv file • data = pd.read_csv("nba.csv", index_col ="Name") • # retrieving two rows and two columns by iloc method • row2 = data.iloc [[3, 4], [1, 2]] • print(row2) • import pandas as pd • # making data frame from csv file • data = pd.read_csv("nba.csv", index_col ="Name") • # retrieving all rows and some columns by iloc method • row2 = data.iloc [:, [1, 2]] • print(row2)
  • 22.
    • import pandasas pd • # making data frame from csv file • data = pd.read_csv("nba.csv", index_col ="Name") • # retrieving row by ix method • first = data.ix["Avery Bradley"] #index slicing • print(first) • • import pandas as pd • # making data frame from csv file • data = pd.read_csv("nba.csv", index_col ="Name") • # retrieving row by ix method • first = data.ix[1] • print(first)
  • 23.
    Function Description Dataframe.head() Returntop n rows of a data frame. Dataframe.tail() Return bottom n rows of a data frame. Dataframe.at[] Access a single value for a row/column label pair. Dataframe.iat[] Access a single value for a row/column pair by integer position. Dataframe.tail() Purely integer-location based indexing for selection by position. DataFrame.lookup() Label-based “fancy indexing” function for DataFrame. DataFrame.pop() Return item and drop from frame. DataFrame.xs() Returns a cross-section (row(s) or column(s)) from the DataFrame. DataFrame.get() Get item from object for given key (DataFrame column, Panel slice, etc.). DataFrame.isin() Return boolean DataFrame showing whether each element in the DataFrame is contained in values. DataFrame.where() Return an object of same shape as self and whose corresponding entries are from self where cond is True and otherwise are from other. DataFrame.mask() Return an object of same shape as self and whose corresponding entries are from self where cond is False and otherwise are from other. DataFrame.query() Query the columns of a frame with a boolean expression. DataFrame.insert() Insert column into DataFrame at specified location.
  • 24.
    • df =pd.DataFrame([['1990', 'a', 5, 4, 7, 2], ['1991', 'c', 10, 1, 2, 0], ['1992', 'd', 2, 1, 4, 12], ['1993', 'a', 5, 8, 11, 6]], columns=('Date', 'best', 'a', 'b', 'c', 'd')) • df['value'] = df.lookup(df.index, df['best']) • df.at[2,’Date’] • df.iat[1,1] • df.pop(‘b') • df.get(‘best’) • df.xs(0) #index df.set_index(best) df.xs(‘a’) • df.isin([1990, 1991]) • df.isin({‘Date': [1990, 1991]}) • df.isin({‘best':[‘a’,’c’],‘a':[2]}) • newdf = df.where(df[“a"] > 3) • newdf = df.mask(df[“a"] > 3) • print(df.query('a > 3')) • df.insert(1, “value", [50, 40, 30,20])
  • 25.
    Python Pandas Dataoperations • A = pd.Series([2, 4, 6], index=[0, 1, 2]) • B = pd.Series([1, 3, 5], index=[1, 2, 3]) • A + B • A.add(B, fill_value=0)
  • 26.
    area = pd.Series({'Alaska':1723337, 'Texas': 695662, 'California': 423967}, name='area') population = pd.Series({'California': 38332521, 'Texas': 26448193, 'New York': 19651127}, name='population') area.index | population.index population / area
  • 27.
    • A =pd.DataFrame(rng.randint(0, 20, (2, 2)), columns=list('AB')) • B = pd.DataFrame(rng.randint(0, 10, (3, 3)), columns=list('BAC')) • A + B • fill = A.stack().mean() • A.add(B, fill_value=fill)
  • 28.
    Python Operator Pandas Method(s) • + add() • - sub(), subtract() • * mul(), multiply() • / truediv(), div(), divide() • // floordiv() • % mod() • ** pow()
  • 29.
    Hierarchical indexing, also known as multi-indexing, means setting more than one column as the index. • import pandas as pd • df = pd.read_csv('homelessness.csv') • print(df.head()) • col = df.columns • print(col) • # using the pandas set_index() function. • df_ind3 = df.set_index(['region', 'state', 'individuals']) • # we can sort the data by using sort_index() • df_ind3.sort_index() • print(df_ind3.head(10)) • df_ind3_region = df_ind3.loc[['Pacific', 'Mountain']] • print(df_ind3_region.head(10))
  • 30.
    Merge, Join, andConcatenate DataFrames Using Pandas # Creating first dataframe df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'], 'B': ['B0', 'B1', 'B2', 'B3'], 'C': ['C0', 'C1', 'C2', 'C3'], 'D': ['D0', 'D1', 'D2', 'D3']}, index=[0, 1, 2, 3]) # Creating second dataframe df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'], 'B': ['B4', 'B5', 'B6', 'B7'], 'C': ['C4', 'C5', 'C6', 'C7'], 'D': ['D4', 'D5', 'D6', 'D7']}, index=[4, 5, 6, 7]) # Creating third dataframe df3 = pd.DataFrame({'A': ['A8', 'A9', 'A10', 'A11'], 'B': ['B8', 'B9', 'B10', 'B11'], 'C': ['C8', 'C9', 'C10', 'C11'], 'D': ['D8', 'D9', 'D10', 'D11']}, index=[8, 9, 10, 11]) # Concatenating the dataframes pd.concat([df1, df2, df3])
  • 31.
    • • # Dataframecreated • left = pd.DataFrame({'Key': ['K0', 'K1', 'K2', 'K3'], • 'A': ['A0', 'A1', 'A2', 'A3'], • 'B': ['B0', 'B1', 'B2', 'B3']}) • • right = pd.DataFrame({'Key': ['K0', 'K1', 'K2', 'K3'], • 'C': ['C0', 'C1', 'C2', 'C3'], • 'D': ['D0', 'D1', 'D2', 'D3']}) • • # Merging the dataframes • pd.merge(left, right, how='inner', on='Key') • • left_merged = pd.merge(left, right, how='left', on='Key') • print(left_merged) • • right_merged = pd.merge(left, right, how='right', on='Key') • print(right_merged) • • outer_merged = pd.merge(left, right, how='outer', on='Key') • print(outer_merged)
  • 32.
    • left =pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'], • 'B': ['B0', 'B1', 'B2', 'B3']}, • index=['K0', 'K1', 'K2', 'K3']) • • right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'], • 'D': ['D0', 'D1', 'D2', 'D3']}, • index=['K0', 'K1', 'K2', 'K3']) • • # Joining the dataframes • left.join(right)
  • 33.
    • import pandasas pd • • # Creating our dataset • df = pd.DataFrame([[9, 4, 8, 9], • [8, 10, 7, 6], • [7, 6, 8, 5]], • columns=['Maths', 'English', • 'Science', 'History']) • • # display dataset • print(df) • df.describe() • df.agg(['sum', 'min', 'max']) • df.groupby(by=['Maths']) • a = df.groupby('Maths') • a.first() • b = df.groupby(['Maths', 'Science']) • b.first() • dataset.groupby('cut').sum() • dataset.groupby(['cut', 'color']).agg('min')
  • 34.
    • sum() :Compute sum of column values • min() :Compute min of column values • max() :Compute max of column values • mean() :Compute mean of column • size() :Compute column sizes • describe() :Generates descriptive statistics • first() :Compute first of group values • last() :Compute last of group values • count() :Compute count of column values • std() :Standard deviation of column • var() :Compute variance of column • sem() :Standard error of the mean of column