import pandas as pd


# Six consecutive integers (5..10) stored in a Series; no index is passed,
# so pandas assigns the default positional one.
my_series = pd.Series(range(5, 11))
my_series

0     5
1     6
2     7
3     8
4     9
5    10
dtype: int64


my_series.index

RangeIndex(start=0, stop=6, step=1)


my_series.values

array([ 5,  6,  7,  8,  9, 10], dtype=int64)


# Same values as before, but each one is labelled with a letter 'a'..'f'
# instead of a position.
my_series2 = pd.Series(range(5, 11), index=list('abcdef'))
my_series2

a     5
b     6
c     7
d     8
e     9
f    10
dtype: int64


my_series2.index

Index(['a', 'b', 'c', 'd', 'e', 'f'], dtype='object')


my_series[[4]]

4    9
dtype: int64


my_series2[['a']]

a    5
dtype: int64


# Overwrite the entries labelled 'a', 'b' and 'f' with 0 in one assignment.
my_series2.loc[['a', 'b', 'f']] = 0
my_series2

a    0
b    0
c    7
d    8
e    9
f    0
dtype: int64


my_series2[(my_series2 > 0)]

c    7
d    8
e    9
dtype: int64


# Build a Series from a mapping: keys become the index, values the data.
my_series3 = pd.Series(dict(zip('abcd', range(5, 9))))
my_series3

a    5
b    6
c    7
d    8
dtype: int64


my_series3.name = 'numbers'
my_series3.index.name = 'letters'
my_series3

letters
a    5
b    6
c    7
d    8
Name: numbers, dtype: int64


# Small demo table, one record per country.
# NOTE(review): units are not stated here — population looks like millions
# of people and area like square kilometres; confirm.
df = pd.DataFrame(
    [
        ('Kazakhstan', 17.04, 2724902),
        ('Russia', 143.5, 17125191),
        ('Belarus', 9.5, 207600),
        ('Ukraine', 45.5, 603628),
    ],
    columns=['country', 'population', 'area'],
)
df


df.population  # A

0     17.04
1    143.50
2      9.50
3     45.50
Name: population, dtype: float64


df["population"]  # B

0     17.04
1    143.50
2      9.50
3     45.50
Name: population, dtype: float64

df


df.index = ['KZ', 'RU', 'BY', 'UA']
df


titanic = pd.read_csv("./titanic.csv", sep=",")
titanic.head(10)


# Read the CSV in pieces of at most c_size rows instead of all at once and
# print the (rows, columns) shape of every piece.
c_size = 200

chunk_reader = pd.read_csv("./titanic.csv", sep=",", chunksize=c_size)
for gm_chunk in chunk_reader:
    print(gm_chunk.shape)

(200, 12)
(200, 12)
(200, 12)
(200, 12)
(91, 12)


df.loc['KZ']

country       Kazakhstan
population         17.04
area             2724902
Name: KZ, dtype: object


df.iloc[0]

country       Kazakhstan
population         17.04
area             2724902
Name: KZ, dtype: object


df.loc[['KZ', 'RU'], 'population']

KZ     17.04
RU    143.50
Name: population, dtype: float64


df.iloc[[0, 1],1]

KZ     17.04
RU    143.50
Name: population, dtype: float64


df.iloc[[0, 1],[1]]


df.iloc[[0, 1],[1]]


titanic[titanic["Age"] > 60][["Name", "Ticket", "Fare"]].head(3)


titanic[(titanic.Age < 10) & (titanic.Sex == "male")][["Name", "Fare"]].head(4)


# Boolean mask that is True for rows whose Pclass equals 3; use it to
# select those rows and preview the first few.
filters = titanic["Pclass"] == 3
titanic.loc[filters].head()

df


# Derived column: population divided by area, scaled by one million.
# NOTE(review): the scaling suggests population is stored in millions, which
# would make this people per unit of area — confirm the units.
df['density'] = df['population'].div(df['area']).mul(1000000)
df


x = df.drop(['density'], axis=1)

df


df.drop(['density'], axis=1, inplace=True)
df


df.rename(columns={"population": "on_vacation", "weird": "even_stranger"})


titanic.nlargest(3, "Age")


titanic.nsmallest(3, "Fare")


import matplotlib.pyplot as plt
import seaborn as sns


import numpy as np

x = np.random.normal(size=100)
y = np.random.normal(size=100)


plt.scatter(x, y)

<matplotlib.collections.PathCollection at 0x15bd5eebfc8>


sns.set_style("darkgrid")


plt.scatter(x, y)

<matplotlib.collections.PathCollection at 0x15bd5eeba08>


tips = sns.load_dataset("tips")
tips.head()


sns.relplot(x="total_bill", y="tip", data=tips)

<seaborn.axisgrid.FacetGrid at 0x15bd769c608>


sns.relplot(x="total_bill", y="tip", hue="size", data=tips)

<seaborn.axisgrid.FacetGrid at 0x15bd769c288>


sns.relplot(x="total_bill", y="tip", size="size", data=tips)

<seaborn.axisgrid.FacetGrid at 0x15bd77bc6c8>


sns.relplot(x="total_bill", y="tip", hue="day", size="size", style="smoker", data=tips)

<seaborn.axisgrid.FacetGrid at 0x15bd785ef08>


sns.relplot(x="total_bill", y="tip", hue="size", col="smoker", row="time", data=tips,
            height=3)

<seaborn.axisgrid.FacetGrid at 0x15bd78c3408>


sns.displot(x="total_bill", data=tips)

<seaborn.axisgrid.FacetGrid at 0x15bd7a55cc8>


sns.displot(x="total_bill", kde=True, data=tips)

<seaborn.axisgrid.FacetGrid at 0x15bd7b06f48>


sns.displot(x="total_bill", y="tip", data=tips)

<seaborn.axisgrid.FacetGrid at 0x15bd7a6f108>


sns.displot(x="total_bill", y="tip", kind="kde", data=tips)

<seaborn.axisgrid.FacetGrid at 0x15bd7aff3c8>


sns.displot(x="total_bill", y="tip", hue="time", kind="kde", data=tips)

<seaborn.axisgrid.FacetGrid at 0x15bd7aff388>


sns.displot(x="total_bill", hue="smoker", data=tips,
            alpha=0.5)

<seaborn.axisgrid.FacetGrid at 0x15bd78b3e88>


sns.displot(x="total_bill", hue="smoker", data=tips,
            alpha=0.5, element="step")

<seaborn.axisgrid.FacetGrid at 0x15bd8db2b88>


sns.displot(x="total_bill", hue="day", data=tips,
            alpha=0.5, element="step")

<seaborn.axisgrid.FacetGrid at 0x15bd8e90788>


# catplot is a figure-level function: it creates its own figure, so a
# preceding plt.figure(figsize=...) only left an empty extra figure behind.
# Request the 6x6 size through height/aspect instead.
sns.catplot(x="day", y="total_bill", kind="box", data=tips,
            height=6, aspect=1)

<seaborn.axisgrid.FacetGrid at 0x15bd8e4df88>

<Figure size 432x432 with 0 Axes>


# catplot is a figure-level function: it creates its own figure, so a
# preceding plt.figure(figsize=...) only left an empty extra figure behind.
# Request the 6x6 size through height/aspect instead.
sns.catplot(x="day", y="total_bill", hue="smoker", kind="box", data=tips,
            height=6, aspect=1)

<seaborn.axisgrid.FacetGrid at 0x15bd90038c8>

<Figure size 432x432 with 0 Axes>


# catplot is a figure-level function: it creates its own figure, so a
# preceding plt.figure(figsize=...) only left an empty extra figure behind.
# Request the 6x6 size through height/aspect instead.
sns.catplot(x="day", y="total_bill", kind="violin", data=tips,
            height=6, aspect=1)

<seaborn.axisgrid.FacetGrid at 0x15bd90eb848>

<Figure size 432x432 with 0 Axes>


# catplot is a figure-level function: it creates its own figure, so a
# preceding plt.figure(figsize=...) only left an empty extra figure behind.
# Request the 6x6 size through height/aspect instead.
sns.catplot(x="day", y="total_bill", kind="boxen", data=tips,
            height=6, aspect=1)

<seaborn.axisgrid.FacetGrid at 0x15bd917d088>

<Figure size 432x432 with 0 Axes>


# catplot is a figure-level function: it creates its own figure, so a
# preceding plt.figure(figsize=...) only left an empty extra figure behind.
# Request the 6x6 size through height/aspect instead.
sns.catplot(x="day", y="total_bill", kind="strip", data=tips,
            height=6, aspect=1)

<seaborn.axisgrid.FacetGrid at 0x15bd9203a48>

<Figure size 432x432 with 0 Axes>


# catplot is a figure-level function: it creates its own figure, so a
# preceding plt.figure(figsize=...) only left an empty extra figure behind.
# Request the 6x6 size through height/aspect instead.
sns.catplot(x="day", y="total_bill", kind="swarm", data=tips,
            height=6, aspect=1)

<seaborn.axisgrid.FacetGrid at 0x15bd926f508>

<Figure size 432x432 with 0 Axes>


car_crashes = sns.load_dataset("car_crashes")
car_crashes.head(3)


# heatmap draws on the current axes, so sizing via plt.figure() works here.
plt.figure(figsize=(6, 5))
# Correlate the numeric columns only: the frame also carries the string
# column `abbrev`, which DataFrame.corr() rejects on pandas >= 2.0.
sns.heatmap(car_crashes.select_dtypes(include="number").corr())

<AxesSubplot:>


# heatmap draws on the current axes, so sizing via plt.figure() works here.
plt.figure(figsize=(6, 5))
# Correlate the numeric columns only: the frame also carries the string
# column `abbrev`, which DataFrame.corr() rejects on pandas >= 2.0.
# center=0 pins the middle of the diverging "seismic" colormap to zero
# correlation.
sns.heatmap(car_crashes.select_dtypes(include="number").corr(),
            cmap="seismic", center=0, linewidths=.5)

<AxesSubplot:>


df = pd.read_csv("./titanic.csv", sep=",")
# df = sns.load_dataset("titanic")


df.head(5)


df.shape

(891, 12)


df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


df.describe()


df.describe(include=['object'])


df["Sex"].value_counts()

male      577
female    314
Name: Sex, dtype: int64


# info() writes its report to stdout itself and returns None, so wrapping it
# in print() only appended a stray "None" line after the report.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None


df.sort_values(by=['Pclass', 'Fare'], ascending=[True, False]).head(8)


df[["Age", "Fare"]].apply(np.max)

Age      80.0000
Fare    512.3292
dtype: float64


df[["Age", "Fare"]].apply(np.max, axis=0)

Age      80.0000
Fare    512.3292
dtype: float64


# axis=1 applies the function across each row, i.e. it takes the larger of
# Age and Fare for every passenger. Shown only to illustrate the axis
# argument; comparing these two columns is not meaningful in itself.
df[["Age", "Fare"]].apply(np.max, axis=1).head(4)

0    22.0000
1    71.2833
2    26.0000
3    53.1000
dtype: float64


df.Fare.apply(np.log2).head(10)

0    2.857981
1    6.155492
2    2.986411
3    5.730640
4    3.008989
5    3.080368
6    5.696620
7    4.397461
8    3.476809
9    4.910291
Name: Fare, dtype: float64


def is_abe(x):
    """Return True when the name string ``x`` contains "Abraham".

    The match is a case-sensitive substring test. Non-string values (for
    example NaN standing in for a missing name) yield False instead of
    raising TypeError.
    """
    return isinstance(x, str) and "Abraham" in x


abrahams = df[df['Name'].apply(is_abe)]
abrahams


abrahams["Fare"].mean()

23.46875


df.Embarked.value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64


sex_code = {'female' : 1, 'male' : 0}


df['Sex_code'] = df['Sex'].map(sex_code)
df.tail(3)


df.groupby(by="Embarked")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000015BDA468BC8>


df.groupby(by="Embarked")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000015BDA444D48>


df.groupby(by=["Embarked", "Sex"])["Fare"].mean()

Embarked  Sex   
C         female    75.169805
          male      48.262109
Q         female    12.634958
          male      13.838922
S         female    38.740929
          male      21.711996
Name: Fare, dtype: float64


np.mean(df.groupby(by=["Embarked", "Sex"])["Fare"])

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-87-c53e3acceb4f> in <module>
----> 1 np.mean(df.groupby(by=["Embarked", "Sex"])["Fare"])

<__array_function__ internals> in mean(*args, **kwargs)

~\Anaconda3\lib\site-packages\numpy\core\fromnumeric.py in mean(a, axis, dtype, out, keepdims, where)
   3415             pass
   3416         else:
-> 3417             return mean(axis=axis, dtype=dtype, out=out, **kwargs)
   3418 
   3419     return _methods._mean(a, axis=axis, dtype=dtype,

TypeError: mean() got an unexpected keyword argument 'axis'


df.groupby(by=["Embarked", "Sex"])["Fare"].agg(np.mean)

Embarked  Sex   
C         female    75.169805
          male      48.262109
Q         female    12.634958
          male      13.838922
S         female    38.740929
          male      21.711996
Name: Fare, dtype: float64


# Name the aggregations as strings rather than passing NumPy callables:
# pandas then uses its own groupby implementations and labels the result
# columns mean/std/min/max independent of the NumPy version (np.min/np.max
# showed up as amin/amax), and newer pandas no longer warns about NumPy
# callables being passed here.
df.groupby(by=["Embarked", "Sex"])["Fare"].agg(["mean", "std", "min", "max"])


df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Sex_code'],
      dtype='object')


pd.crosstab(df["Sex"], df["Pclass"])


pd.crosstab(df["Sex"], df["Pclass"], normalize=True)


np.nan in [np.nan]

True


np.nan == np.nan

False


np.nan is np.nan

True


np.nan + 19

nan


np.nan - np.nan

nan


np.isnan(np.nan), np.isnan(False)

(True, False)


np.isnan(None)

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-99-f53008f0bd14> in <module>
----> 1 np.isnan(None)

TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''


pd.isna(np.nan), pd.isnull(np.nan)

(True, True)


pd.isna is pd.isnull

True


a = np.array([1.0, 2.5, np.nan, np.pi])


np.isnan(a)

array([False, False,  True, False])


pd.isna(a)

array([False, False,  True, False])


b = pd.Series(["alpha", "beta", np.nan])


np.isnan(b)

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-106-43c8483872d1> in <module>
----> 1 np.isnan(b)

~\Anaconda3\lib\site-packages\pandas\core\generic.py in __array_ufunc__(self, ufunc, method, *inputs, **kwargs)
   1934         self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any
   1935     ):
-> 1936         return arraylike.array_ufunc(self, ufunc, method, *inputs, **kwargs)
   1937 
   1938     # ideally we would define this to avoid the getattr checks, but

~\Anaconda3\lib\site-packages\pandas\core\arraylike.py in array_ufunc(self, ufunc, method, *inputs, **kwargs)
    356         # ufunc(series, ...)
    357         inputs = tuple(extract_array(x, extract_numpy=True) for x in inputs)
--> 358         result = getattr(ufunc, method)(*inputs, **kwargs)
    359     else:
    360         # ufunc(dataframe)

TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''


b = pd.Series(["alpha", "beta", np.nan])


pd.isna(b)

0    False
1    False
2     True
dtype: bool


~pd.isna(df).any(axis=1)

0      False
1       True
2      False
3       True
4      False
       ...  
886    False
887     True
888    False
889     True
890    False
Length: 891, dtype: bool


df[~pd.isna(df).any(axis=1)].head(4)

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	0	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	0	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	0	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	0	373450	8.0500	NaN	S
5	6	0	3	Moran, Mr. James	male	NaN	0	0	330877	8.4583	NaN	Q
6	7	0	1	McCarthy, Mr. Timothy J	male	54.0	0	0	17463	51.8625	E46	S
7	8	0	3	Palsson, Master. Gosta Leonard	male	2.0	3	1	349909	21.0750	NaN	S
8	9	1	3	Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)	female	27.0	0	2	347742	11.1333	NaN	S
9	10	1	2	Nasser, Mrs. Nicholas (Adele Achem)	female	14.0	1	0	237736	30.0708	NaN	C

	Name	Ticket	Fare
33	Wheadon, Mr. Edward H	C.A. 24579	10.5000
54	Ostby, Mr. Engelhart Cornelius	113509	61.9792
96	Goldschmidt, Mr. George B	PC 17754	34.6542

	Name	Fare
7	Palsson, Master. Gosta Leonard	21.0750
16	Rice, Master. Eugene	29.1250
50	Panula, Master. Juha Niilo	39.6875
63	Skoog, Master. Harald	27.9000

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	0	A/5 21171	7.2500	NaN	S
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	0	STON/O2. 3101282	7.9250	NaN	S
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	0	373450	8.0500	NaN	S
5	6	0	3	Moran, Mr. James	male	NaN	0	0	330877	8.4583	NaN	Q
7	8	0	3	Palsson, Master. Gosta Leonard	male	2.0	3	1	349909	21.0750	NaN	S

	total_bill	tip	sex	smoker	day	time	size
0	16.99	1.01	Female	No	Sun	Dinner	2
1	10.34	1.66	Male	No	Sun	Dinner	3
2	21.01	3.50	Male	No	Sun	Dinner	3
3	23.68	3.31	Male	No	Sun	Dinner	2
4	24.59	3.61	Female	No	Sun	Dinner	4

Библиотека Pandas¶

Основы Pandas¶

Что дает нам Pandas?¶

`pandas.Series`¶

`pandas.DataFrame`¶

Почему строки, а не столбцы?¶

Импортируем данные!¶

Экспортируем данные!¶

Запрашиваем данные у `DataFrame`¶

В каком случае возвращается `Series`, а в каком `DataFrame`?¶

Правильный ответ D¶

Seaborn¶

Какие графики есть в Seaborn?¶

Вспомним нашу схему выбора типа графика:¶

`sns.relplot`¶

Графики для распределений:¶

Какую из разновидностей выбрать?¶

Делаем очень базовый EDA в Pandas¶

Что такое `NaN`?¶

Quiz™¶

Как выкинуть строки с NaN из таблицы?¶

Общее правило, как понять, делает ли функция то, что вам нужно:¶

На этом всё!¶

	country	population	area
0	Kazakhstan	17.04	2724902
1	Russia	143.50	17125191
2	Belarus	9.50	207600
3	Ukraine	45.50	603628

	PassengerId	Survived	Pclass	Name	Sex	Age	Ticket	Fare	Cabin	Embarked
630	631	1	1	Barkworth, Mr. Algernon Henry Wilson	male	80.0	27042	30.0000	A23	S
851	852	0	3	Svensson, Mr. Johan	male	74.0	347060	7.7750	NaN	S
96	97	0	1	Goldschmidt, Mr. George B	male	71.0	PC 17754	34.6542	A5	C

	PassengerId	Survived	Pclass	Name	Sex	Age	Ticket	Cabin	Embarked
179	180	0	3	Leonard, Mr. Lionel	male	36.0	LINE	NaN	S
263	264	0	1	Harrison, Mr. William	male	40.0	112059	B94	S
271	272	1	3	Tornquist, Mr. William Henry	male	25.0	LINE	NaN	S

	total	speeding	alcohol	not_distracted	no_previous	ins_premium	ins_losses	abbrev
0	18.8	7.332	5.640	18.048	15.040	784.55	145.08	AL
1	18.1	7.421	4.525	16.290	17.014	1053.48	133.93	AK
2	18.6	6.510	5.208	15.624	17.856	899.47	110.35	AZ

	PassengerId	Survived	Pclass	Age	SibSp	Parch	Fare
count	891.000000	891.000000	891.000000	714.000000	891.000000	891.000000	891.000000
mean	446.000000	0.383838	2.308642	29.699118	0.523008	0.381594	32.204208
std	257.353842	0.486592	0.836071	14.526497	1.102743	0.806057	49.693429
min	1.000000	0.000000	1.000000	0.420000	0.000000	0.000000	0.000000
25%	223.500000	0.000000	2.000000	20.125000	0.000000	0.000000	7.910400
50%	446.000000	0.000000	3.000000	28.000000	0.000000	0.000000	14.454200
75%	668.500000	1.000000	3.000000	38.000000	1.000000	0.000000	31.000000
max	891.000000	1.000000	3.000000	80.000000	8.000000	6.000000	512.329200

	Name	Sex	Ticket	Cabin	Embarked
count	891	891	891	204	889
unique	891	2	681	147	3
top	McCoy, Mr. Bernard	male	CA. 2343	G6	S
freq	1	577	7	4	644

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
258	259	1	1	Ward, Miss. Anna	female	35.0	0	0	PC 17755	512.3292	NaN	C
679	680	1	1	Cardeza, Mr. Thomas Drake Martinez	male	36.0	0	1	PC 17755	512.3292	B51 B53 B55	C
737	738	1	1	Lesurer, Mr. Gustave J	male	35.0	0	0	PC 17755	512.3292	B101	C
27	28	0	1	Fortune, Mr. Charles Alexander	male	19.0	3	2	19950	263.0000	C23 C25 C27	S
88	89	1	1	Fortune, Miss. Mabel Helen	female	23.0	3	2	19950	263.0000	C23 C25 C27	S
341	342	1	1	Fortune, Miss. Alice Elizabeth	female	24.0	3	2	19950	263.0000	C23 C25 C27	S
438	439	0	1	Fortune, Mr. Mark	male	64.0	1	4	19950	263.0000	C23 C25 C27	S
311	312	1	1	Ryerson, Miss. Emily Borie	female	18.0	2	2	PC 17608	262.3750	B57 B59 B63 B66	C

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
785	786	0	3	Harmer, Mr. Abraham (David Lishin)	male	25.0	0	0	374887	7.2500	NaN	S
824	825	0	3	Panula, Master. Urho Abraham	male	2.0	4	1	3101295	39.6875	NaN	S

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked	Sex_code
888	889	0	3	Johnston, Miss. Catherine Helen "Carrie"	female	NaN	1	2	W./C. 6607	23.45	NaN	S	1
889	890	1	1	Behr, Mr. Karl Howell	male	26.0	0	0	111369	30.00	C148	C	0
890	891	0	3	Dooley, Mr. Patrick	male	32.0	0	0	370376	7.75	NaN	Q	0

		mean	std	amin	amax
Embarked	Sex
C	female	75.169805	83.574380	7.2250	512.3292
C	male	48.262109	82.715093	4.0125	512.3292
Q	female	12.634958	14.298841	6.7500	90.0000
Q	male	13.838922	14.243486	6.7500	90.0000
S	female	38.740929	46.047877	7.2500	263.0000
S	male	21.711996	28.584699	0.0000	263.0000

Pclass	1	2	3
Sex
female	94	76	144
male	122	108	347

Pclass	1	2	3
Sex
female	0.105499	0.085297	0.161616
male	0.136925	0.121212	0.389450

Библиотека Pandas¶

Основы Pandas¶

Что дает нам Pandas?¶

pandas.Series¶

pandas.DataFrame¶

Почему строки, а не столбцы?¶

Импортируем данные!¶

Экспортируем данные!¶

Запрашиваем данные у DataFrame¶

В каком случае возвращается Series, а в каком DataFrame?¶

Правильный ответ D¶

Seaborn¶

Какие графики есть в Seaborn?¶

Вспомним нашу схему выбора типа графика:¶

sns.relplot¶

Графики для распределений:¶

Какую из разновидностей выбрать?¶

Делаем очень базовый EDA в Pandas¶

Что такое NaN?¶

Quiz™¶

Как выкинуть строки с NaN из таблицы?¶

Общее правило, как понять, делает ли функция то, что вам нужно:¶

На этом всё!¶

`pandas.Series`¶

`pandas.DataFrame`¶

Запрашиваем данные у `DataFrame`¶

В каком случае возвращается `Series`, а в каком `DataFrame`?¶

`sns.relplot`¶

Что такое `NaN`?¶