import pandas as pd
import numpy as np

pd.__version__

'2.2.3'

obj=pd.Series([1,"John",3.5,"Hey"])
obj

0       1
1    John
2     3.5
3     Hey
dtype: object

obj.values

array([1, 'John', 3.5, 'Hey'], dtype=object)

obj2=pd.Series([1,"John",3.5,"Hey"],index=["a","b","c","d"])
obj2

a       1
b    John
c     3.5
d     Hey
dtype: object

obj2["b"]

'John'

obj2.index

Index(['a', 'b', 'c', 'd'], dtype='object')

# Exam scores keyed by student name.
score = dict(Jane=90, Bill=80, Elon=85, Tom=75, Tim=95)
# Building a Series from a dict uses the keys as the index and the values as data.
names = pd.Series(score)
names

Jane    90
Bill    80
Elon    85
Tom     75
Tim     95
dtype: int64

names["Tim"]

95

names[names>=85]

Jane    90
Elon    85
Tim     95
dtype: int64

names["Tom"]=60
names

Jane    90
Bill    80
Elon    85
Tom     60
Tim     95
dtype: int64

names[names<=80]=83
names

Jane    90
Bill    83
Elon    85
Tom     83
Tim     95
dtype: int64

"Tom" in names

True

names/10

Jane    9.0
Bill    8.3
Elon    8.5
Tom     8.3
Tim     9.5
dtype: float64

names**2

Jane    8100
Bill    6889
Elon    7225
Tom     6889
Tim     9025
dtype: int64

names.isnull()

Jane    False
Bill    False
Elon    False
Tom     False
Tim     False
dtype: bool

games=pd.read_csv("https://raw.githubusercontent.com/TirendazAcademy/PANDAS-TUTORIAL/refs/heads/main/DataSets/vgsalesGlobale.csv")

games.head()

games.dtypes

Rank              int64
Name             object
Platform         object
Year            float64
Genre            object
Publisher        object
NA_Sales        float64
EU_Sales        float64
JP_Sales        float64
Other_Sales     float64
Global_Sales    float64
dtype: object

games.Genre.describe()

count      16598
unique        12
top       Action
freq        3316
Name: Genre, dtype: object

games.Genre.value_counts()

Genre
Action          3316
Sports          2346
Misc            1739
Role-Playing    1488
Shooter         1310
Adventure       1286
Racing          1249
Platform         886
Simulation       867
Fighting         848
Strategy         681
Puzzle           582
Name: count, dtype: int64

games.Genre.value_counts(normalize=True)

Genre
Action          0.199783
Sports          0.141342
Misc            0.104772
Role-Playing    0.089649
Shooter         0.078925
Adventure       0.077479
Racing          0.075250
Platform        0.053380
Simulation      0.052235
Fighting        0.051090
Strategy        0.041029
Puzzle          0.035064
Name: proportion, dtype: float64

type(games.Genre.value_counts())

pandas.core.series.Series

games.Genre.unique()

array(['Sports', 'Platform', 'Racing', 'Role-Playing', 'Puzzle', 'Misc',
       'Shooter', 'Simulation', 'Action', 'Fighting', 'Adventure',
       'Strategy'], dtype=object)

games.Genre.nunique()

12

pd.crosstab(games.Genre, games.Year)

games.Global_Sales.describe()

count    16598.000000
mean         0.537441
std          1.555028
min          0.010000
25%          0.060000
50%          0.170000
75%          0.470000
max         82.740000
Name: Global_Sales, dtype: float64

print(games.Global_Sales.mean())

print(games.Global_Sales.median())

print(games.Global_Sales.std())

print(games.Global_Sales.max())

0.5374406555006628
0.17
1.5550279355699124
82.74

games.Global_Sales.value_counts()

Global_Sales
0.02    1071
0.03     811
0.04     645
0.05     632
0.01     618
        ... 
5.01       1
5.05       1
5.07       1
5.11       1
3.16       1
Name: count, Length: 623, dtype: int64

games.Year.plot(kind="hist")

<Axes: ylabel='Frequency'>

games.Year.plot(kind="box")

<Axes: >

games.Year.plot(kind="kde")

<Axes: ylabel='Density'>

games.Genre.value_counts().plot(kind="bar")

<Axes: xlabel='Genre'>

data={"name":["Bill","Tom","Tim","John","Alex","Vanessa","Kate"],      
      "score":[90,80,85,75,95,60,65],      
      "sport":["Wrestling","Football","Skiing","Swimming","Tennis",
               "Karete","Surfing"],      
      "sex":["M","M","M","M","F","F","F"]}

df=pd.DataFrame(data)
df

df=pd.DataFrame(data,columns=["name","sport","sex","score"])
df

# Build the frame with an explicit column order and string row labels.
# "gender" and "age" are not keys of `data` (its keys are name, score, sport, sex),
# so those two columns are created with missing values (NaN).
# NOTE(review): if the "sex" values were meant to appear here, the column must be
# listed as "sex" rather than "gender" -- confirm the intent.
df=pd.DataFrame(data,columns=["name", "sport", "gender", "score", "age"],
                index=["one","two","three","four","five","six","seven"])
df

df["sport"]

one      Wrestling
two       Football
three       Skiing
four      Swimming
five        Tennis
six         Karete
seven      Surfing
Name: sport, dtype: object

my_columns=["name","sport"]
df[my_columns]

df.sport

one      Wrestling
two       Football
three       Skiing
four      Swimming
five        Tennis
six         Karete
seven      Surfing
Name: sport, dtype: object

df.loc[["one"]]

df.loc[["one","two"]]

df["age"]=18

# Rebuild the frame from `data`, then fill the (initially empty) "age" column
# from a list holding one value per row.
# NOTE(review): "gender" is not a key of `data` (the key is "sex"), so this
# column stays NaN -- confirm that is intended.
df=pd.DataFrame(data,columns=["name", "sport", "gender", "score", "age"], 
                index=["one","two","three","four","five","six","seven"])
values=[18,19,20,18,17,17,18]
df["age"]=values
df

df["pass"]=df.score>=70
df

del df["pass"]
df

scores={"Math":{"A":85,"B":90,"C":95}, "Physics":{"A":90,"B":80,"C":75}}

scores_df=pd.DataFrame(scores)
scores_df

scores_df.T

scores_df.index.name="name"
scores_df.columns.name="lesson"
scores_df

scores_df.values

array([[85, 90],
       [90, 80],
       [95, 75]])

scores_index=scores_df.index

# Index objects are immutable: the item assignment below raises
# TypeError("Index does not support mutable operations"), so the
# following display line is never reached in this cell.
scores_index[1]="Jack"
scores_index

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[58], line 1
----> 1 scores_index[1]="Jack"
      2 scores_index

File ~/work/AI/blog/.venv/lib/python3.12/site-packages/pandas/core/indexes/base.py:5371, in Index.__setitem__(self, key, value)
   5369 @final
   5370 def __setitem__(self, key, value) -> None:
-> 5371     raise TypeError("Index does not support mutable operations")

TypeError: Index does not support mutable operations

import numpy as np

obj=pd.Series(np.arange(5),
              index=["a","b","c","d","e"])
obj

a    0
b    1
c    2
d    3
e    4
dtype: int64

obj["c"]

2

# Positional lookup goes through .iloc. A bare integer key (obj[2]) on a
# label-indexed Series is deprecated and will be treated as a label in future pandas.
obj.iloc[2]

/var/folders/59/c32_bthx48jd9m2ym5m3tnpw0000j7/T/ipykernel_18768/1662947756.py:1: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  obj[2]

2

obj[0:3]

a    0
b    1
c    2
dtype: int64

obj[["a","c"]]

a    0
c    2
dtype: int64

# Select several rows by position with .iloc. A list of integer keys passed to
# obj[...] on a label-indexed Series is deprecated positional access.
obj.iloc[[0,2]]

/var/folders/59/c32_bthx48jd9m2ym5m3tnpw0000j7/T/ipykernel_18768/1746387968.py:1: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  obj[[0,2]]

a    0
c    2
dtype: int64

obj[obj<2]

a    0
b    1
dtype: int64

obj["a":"c"]

a    0
b    1
c    2
dtype: int64

obj["b":"c"]=5
obj

a    0
b    5
c    5
d    3
e    4
dtype: int64

# 4x4 frame holding the integers 0..15 row by row; rows are labelled by city,
# columns by ordinal name.
data = pd.DataFrame(
    data=np.arange(16).reshape((4, 4)),
    index=["London", "Paris", "Berlin", "Istanbul"],
    columns=["one", "two", "three", "four"],
)
data

data["two"]

London       1
Paris        5
Berlin       9
Istanbul    13
Name: two, dtype: int64

data[["one","two"]]

data[:3]

data[data["four"]>5]

data[data<5]=0
data

data.iloc[1]

one      0
two      5
three    6
four     7
Name: Paris, dtype: int64

data.iloc[1,[1,2,3]]

two      5
three    6
four     7
Name: Paris, dtype: int64

data.loc["Paris",["one","two"]]

one    0
two    5
Name: Paris, dtype: int64

data.loc[:"Paris","four"]

London    0
Paris     7
Name: four, dtype: int64

toy_data=pd.Series(np.arange(5),
                   index=["a","b","c",
                          "d","e"])
toy_data

a    0
b    1
c    2
d    3
e    4
dtype: int64

# Last element by position. .iloc is required here because the index holds
# string labels, and toy_data[-1] relies on deprecated positional fallback.
toy_data.iloc[-1]

/var/folders/59/c32_bthx48jd9m2ym5m3tnpw0000j7/T/ipykernel_18768/3728369251.py:1: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  toy_data[-1]

4

s=pd.Series([1,2,3,4],
            index=["a","b","c","d"])
s

a    1
b    2
c    3
d    4
dtype: int64

s2=s.reindex(["b","d","a","c","e"])
s2

b    2.0
d    4.0
a    1.0
c    3.0
e    NaN
dtype: float64

s3=pd.Series(["blue","yellow","purple"],
             index=[0,2,4])
s3

0      blue
2    yellow
4    purple
dtype: object

s3.reindex(range(6),method="ffill")

0      blue
1      blue
2    yellow
3    yellow
4    purple
5    purple
dtype: object

df=pd.DataFrame(np.arange(9).reshape(3,3),
                index=["a","c","d"],
                columns=["Tim","Tom","Kate"])
df

df2=df.reindex(["d","c","b","a"])
df2

names=["Kate","Tim","Tom"]
df.reindex(columns=names)

df.loc[["c","d","a"]]

s=pd.Series(np.arange(5.),
            index=["a","b","c","d","e"])
s

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

new_s=s.drop("b")
new_s

a    0.0
c    2.0
d    3.0
e    4.0
dtype: float64

s.drop(["c","d"])

a    0.0
b    1.0
e    4.0
dtype: float64

data=pd.DataFrame(np.arange(16).reshape(4,4),
                  index=["Kate","Tim",
                         "Tom","Alex"],
                  columns=list("ABCD"))
data

data.drop(["Kate","Tim"])

data.drop("A",axis=1)

data.drop("Kate",axis=0)

data

data.mean(axis="index")

A    6.0
B    7.0
C    8.0
D    9.0
dtype: float64

data.mean(axis="columns")

Kate     1.5
Tim      5.5
Tom      9.5
Alex    13.5
dtype: float64

data.mean(axis=None)

7.5

# Two integer Series whose indexes only partly overlap (shared labels: a, c, e).
s1 = pd.Series(data=np.arange(4), index=list("acde"))
s2 = pd.Series(data=np.arange(5), index=list("acefg"))

print(s1)
print(s2)

a    0
c    1
d    2
e    3
dtype: int64
a    0
c    1
e    2
f    3
g    4
dtype: int64

s1+s2

a    0.0
c    2.0
d    NaN
e    5.0
f    NaN
g    NaN
dtype: float64

# df1: 2x3 frame of 0..5 with columns A, B, C.
df1 = pd.DataFrame(
    data=np.arange(6).reshape((2, 3)),
    index=["Tim", "Tom"],
    columns=["A", "B", "C"],
)
# df2: 3x3 frame of 0..8 with columns A, C, D and an extra "Kate" row.
df2 = pd.DataFrame(
    data=np.arange(9).reshape((3, 3)),
    index=["Tim", "Kate", "Tom"],
    columns=["A", "C", "D"],
)

print(df1)
print(df2)

     A  B  C
Tim  0  1  2
Tom  3  4  5
      A  C  D
Tim   0  1  2
Kate  3  4  5
Tom   6  7  8

df1+df2

df1.add(df2,fill_value=0)

df1

1/df1

df1/2

s=df2.iloc[1]
s

A    3
C    4
D    5
Name: Kate, dtype: int64

df2

df2-s

s2=df2["A"]
s2

Tim     0
Kate    3
Tom     6
Name: A, dtype: int64

df2.sub(s2,axis="index")

df2

df=pd.DataFrame(
    np.random.randn(4,3),
    columns=list("ABC"),
    index=["Kim","Susan","Tim","Tom"])
df

np.abs(df)

def f(x):
    """Return the spread of ``x``: its maximum minus its minimum.

    Meant to be passed to ``DataFrame.apply``, which hands in one column
    (default) or one row (``axis=1``) at a time.
    """
    return x.max() - x.min()

df.apply(f)

A    4.551196
B    0.976784
C    0.943874
dtype: float64

df.apply(f,axis=1)

Kim      3.668393
Susan    1.249605
Tim      3.009383
Tom      1.644514
dtype: float64

def f(x):
    """Return ``x`` raised to the second power.

    For a pandas object the power is applied element by element.
    """
    return pow(x, 2)

df.apply(f)

s=pd.Series(range(5),
            index=["e","d","a","b","c"])
s

e    0
d    1
a    2
b    3
c    4
dtype: int64

s.sort_index()

a    2
b    3
c    4
d    1
e    0
dtype: int64

s.sort_index()

a    2
b    3
c    4
d    1
e    0
dtype: int64

df=pd.DataFrame(
    np.arange(12).reshape(3,4),
    index=["two","one","three"],
    columns=["d","a","b","c"])
df

df.sort_index()

	Rank	Name	Platform	Year	Genre	Publisher	NA_Sales	EU_Sales	JP_Sales	Other_Sales	Global_Sales
0	1	Wii Sports	Wii	2006.0	Sports	Nintendo	41.49	29.02	3.77	8.46	82.74
1	2	Super Mario Bros.	NES	1985.0	Platform	Nintendo	29.08	3.58	6.81	0.77	40.24
2	3	Mario Kart Wii	Wii	2008.0	Racing	Nintendo	15.85	12.88	3.79	3.31	35.82
3	4	Wii Sports Resort	Wii	2009.0	Sports	Nintendo	15.75	11.01	3.28	2.96	33.00
4	5	Pokemon Red/Pokemon Blue	GB	1996.0	Role-Playing	Nintendo	11.27	8.89	10.22	1.00	31.37

	A	B	C
Tim	inf	1.00	0.5
Tom	0.333333	0.25	0.2

	A	B	C
Kim	2.554629	-1.113764	0.968447
Susan	0.596522	-0.653082	0.068941
Tim	-1.996567	-1.629866	1.012815
Tom	-0.250421	-1.260170	0.384344

	A	B	C
Kim	2.554629	1.113764	0.968447
Susan	0.596522	0.653082	0.068941
Tim	1.996567	1.629866	1.012815
Tom	0.250421	1.260170	0.384344

	A	B	C
Kim	6.526127	1.240471	0.937890
Susan	0.355839	0.426516	0.004753
Tim	3.986281	2.656463	1.025795
Tom	0.062710	1.588027	0.147720

Python Pandas

Series Data Structure

Working with Series Data Structure

DataFrame Data Structure

Indexing & Selection & Filtering

Useful Methods

Arithmetic Operations

Applying a Function

Sorting & Ranking

Reference

Year	1980.0	1981.0	1982.0	1983.0	1984.0	1985.0	1986.0	1987.0	1988.0	1989.0	...	2009.0	2010.0	2011.0	2012.0	2013.0	2014.0	2015.0	2016.0	2017.0	2020.0
Genre
Action	1	25	18	7	1	2	6	2	2	2	...	272	226	239	266	148	186	255	119	1	0
Adventure	0	0	0	1	0	0	0	1	0	0	...	141	154	108	58	60	75	54	34	0	0
Fighting	1	0	0	0	0	1	0	2	0	0	...	53	40	50	29	20	23	21	14	0	0
Misc	4	0	1	1	1	0	0	0	0	1	...	207	201	184	38	42	41	39	18	0	0
Platform	0	3	5	5	1	4	6	2	4	3	...	29	31	37	12	37	10	14	10	0	0
Puzzle	0	2	3	1	3	4	0	0	1	5	...	79	45	43	11	3	8	6	0	0	0
Racing	0	1	2	0	3	0	1	0	1	0	...	84	57	65	30	16	27	19	20	0	0
Role-Playing	0	0	0	0	0	0	1	3	3	2	...	103	103	95	78	71	91	78	40	2	0
Shooter	2	10	5	1	3	1	4	2	1	1	...	91	81	94	48	59	47	34	32	0	0
Simulation	0	1	0	0	0	1	0	0	1	0	...	123	82	56	18	18	11	15	9	0	1
Sports	1	4	2	1	2	1	3	4	2	3	...	184	186	122	54	53	55	62	38	0	0
Strategy	0	0	0	0	0	0	0	0	0	0	...	65	53	46	15	19	8	17	10	0	0

	name	score	sport	sex
0	Bill	90	Wrestling	M
1	Tom	80	Football	M
2	Tim	85	Skiing	M
3	John	75	Swimming	M
4	Alex	95	Tennis	F
5	Vanessa	60	Karete	F
6	Kate	65	Surfing	F

	name	sport	gender	score	age	pass
one	Bill	Wrestling	NaN	90	18	True
two	Tom	Football	NaN	80	19	True
three	Tim	Skiing	NaN	85	20	True
four	John	Swimming	NaN	75	18	True
five	Alex	Tennis	NaN	95	17	True
six	Vanessa	Karete	NaN	60	17	False
seven	Kate	Surfing	NaN	65	18	False

	Math	Physics
A	85	90
B	90	80
C	95	75

	A	B	C
Math	85	90	95
Physics	90	80	75

lesson	Math	Physics
name
A	85	90
B	90	80
C	95	75