-
Notifications
You must be signed in to change notification settings - Fork 1
/
03-datastructures.tex
404 lines (357 loc) · 11.5 KB
/
03-datastructures.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
\documentclass[MASTER.tex]{subfiles}
\begin{document}
%----------------------------------------------------------------------------------------------------%
% \section{Section 3 : Data Structures}
\begin{frame}[fragile]
\frametitle{Data Structures}
\textbf{\textit{pandas}} introduces two new data structures to Python---\textbf{Series} and \textbf{DataFrame}---both of which are built on top of NumPy.
\begin{framed}
\begin{verbatim}
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.set_option('max_columns', 50)
\end{verbatim}
\end{framed}
\end{frame}
%=======================================================================================%
\begin{frame}[fragile]
\frametitle{Series}
Series is a one-dimensional labeled array capable of holding any data type (integers, strings, floating point numbers, Python objects, etc.). The axis labels are collectively referred to as the index. The basic method to create a Series is to call:
\begin{framed}
\begin{verbatim}
s = pd.Series(data, index=index)
\end{verbatim}
\end{framed}
Here, data can be many different things:
\begin{itemize}
\item a Python \texttt{dict}
\item an \texttt{ndarray}
\item a scalar value (like 5)
\end{itemize}
\end{frame}
%=======================================================================================%
\begin{frame}[fragile]
\begin{itemize}
\item A Series is a one-dimensional object similar to an array, list, or column in a table.
\item Each item in the Series is assigned an index label.
\item By default, the index labels run from 0 to N, where N is the length of the Series minus one.
\end{itemize}
\begin{framed}
\begin{verbatim}
# create a Series with an arbitrary list
s = pd.Series([7, 'Heisenberg', 3.14, -1789710578,
'Happy Eating!'])
s
\end{verbatim}
\end{framed}
\end{frame}
%=======================================================================================%
\begin{frame}[fragile]
\frametitle{Series}
\textbf{Output from Previous Slide}
\begin{framed}
\begin{verbatim}
0 7
1 Heisenberg
2 3.14
3 -1789710578
4 Happy Eating!
dtype: object
\end{verbatim}
\end{framed}
\end{frame}
%=======================================================================================%
\begin{frame}[fragile]
Alternatively, you can specify an index to use when creating the Series.
\begin{framed}
\begin{verbatim}
s = pd.Series([7, 'Heisenberg', 3.14, -1789710578,
'Happy Eating!'],
index=['A', 'Z', 'C', 'Y', 'E'])
s
\end{verbatim}
\end{framed}
\begin{verbatim}
A 7
Z Heisenberg
C 3.14
Y -1789710578
E Happy Eating!
dtype: object
\end{verbatim}
\end{frame}
%=======================================================================================%
\begin{frame}[fragile]
\frametitle{Series}
The Series constructor can convert a dictionary as well, using the keys of the dictionary as its index.
\begin{framed}
\begin{verbatim}
d = {'Chicago': 1000, 'New York': 1300, 'Portland': 900, 'San Francisco': 1100,
'Austin': 450, 'Boston': None}
cities = pd.Series(d)
cities
Out[4]:
Austin 450
Boston NaN
Chicago 1000
New York 1300
Portland 900
San Francisco 1100
dtype: float64
\end{verbatim}
\end{framed}
\end{frame}
%=======================================================================================%
\begin{frame}[fragile]
\frametitle{Series}
You can use the index to select specific items from the Series~\ldots
\begin{framed}
\begin{verbatim}
cities['Chicago']
Out[5]:
1000.0
\end{verbatim}
\end{framed}
\end{frame}
%=======================================================================================%
\begin{frame}[fragile]
\frametitle{Series}
\begin{framed}
\begin{verbatim}
cities[['Chicago', 'Portland', 'San Francisco']]
Out[6]:
Chicago 1000
Portland 900
San Francisco 1100
dtype: float64
\end{verbatim}
\end{framed}
\end{frame}
%=======================================================================================%
\begin{frame}[fragile]
\frametitle{Series}
You can use \textbf{\textit{boolean indexing}} for selection.
\begin{framed}
\begin{verbatim}
cities[cities < 1000]
Out[7]:
Austin 450
Portland 900
dtype: float64
\end{verbatim}
\end{framed}
\end{frame}
%=======================================================================================%
\begin{frame}[fragile]
That last one might look a little strange, so let's make it clearer: \texttt{cities < 1000} returns a Series of \texttt{True}/\texttt{False} values, which we then use to index \texttt{cities}, returning only the items for which the condition is \texttt{True}.
\end{frame}
%=======================================================================================%
\begin{frame}[fragile]
\begin{framed}
\begin{verbatim}
less_than_1000 = cities < 1000
print less_than_1000
print '\n'
print cities[less_than_1000]
Austin True
Boston False
Chicago False
New York False
Portland True
San Francisco False
dtype: bool
Austin 450
Portland 900
dtype: float64
\end{verbatim}
\end{framed}
\end{frame}
%=======================================================================================%
\begin{frame}[fragile]
You can also change the values in a Series on the fly.
\begin{framed}
\begin{verbatim}
# changing based on the index
print 'Old value:', cities['Chicago']
cities['Chicago'] = 1400
print 'New value:', cities['Chicago']
Old value: 1000.0
New value: 1400.0
\end{verbatim}
\end{framed}
\end{frame}
%=======================================================================================%
\begin{frame}[fragile]
Changing values using boolean logic
\begin{framed}
\begin{verbatim}
print cities[cities < 1000]
print '\n'
cities[cities < 1000] = 750
print cities[cities < 1000]
Austin 450
Portland 900
dtype: float64
Austin 750
Portland 750
dtype: float64
\end{verbatim}
\end{framed}
\end{frame}
%=======================================================================================%
\begin{frame}[fragile]
\frametitle{Working with Series}
What if you aren't sure whether an item is in the Series? You can check using idiomatic Python.
\begin{framed}
\begin{verbatim}
print 'Seattle' in cities
print 'San Francisco' in cities
False
True
\end{verbatim}
\end{framed}
\end{frame}
%=======================================================================================%
\begin{frame}[fragile]
Mathematical operations can be done using scalars and functions.
\begin{framed}
\begin{verbatim}
# divide city values by 3
cities / 3
Out[12]:
Austin 250.000000
Boston NaN
Chicago 466.666667
New York 433.333333
Portland 250.000000
San Francisco 366.666667
dtype: float64
\end{verbatim}
\end{framed}
\end{frame}
%=======================================================================================%
\begin{frame}[fragile]
\begin{framed}
\begin{verbatim}
# square city values
np.square(cities)
Out[13]:
Austin 562500
Boston NaN
Chicago 1960000
New York 1690000
Portland 562500
San Francisco 1210000
dtype: float64
\end{verbatim}
\end{framed}
\end{frame}
%=======================================================================================%
\begin{frame}[fragile]
You can add two Series together, which returns a Series indexed by the union of the two indexes, with the addition performed on the shared index labels. Labels that appear in only one of the two Series produce NULL/NaN (not a number) in the result.
\begin{framed}
\begin{verbatim}
print cities[['Chicago', 'New York', 'Portland']]
print'\n'
print cities[['Austin', 'New York']]
print'\n'
print cities[['Chicago', 'New York', 'Portland']] + cities[['Austin', 'New York']]
\end{verbatim}
\end{framed}
\end{frame}
%=======================================================================================%
\begin{frame}[fragile]
\begin{verbatim}
Chicago 1400
New York 1300
Portland 750
dtype: float64
Austin 750
New York 1300
dtype: float64
Austin NaN
Chicago NaN
New York 2600
Portland NaN
dtype: float64
\end{verbatim}
\end{frame}
%=======================================================================================%
\begin{frame}[fragile]
\frametitle{Working with Series}
\textbf{NULL Checking}
\begin{itemize}
\item Notice that because Austin, Chicago, and Portland were not found in both Series, they were returned with NULL/NaN values.
\item NULL checking can be performed with \texttt{isnull()} and \texttt{notnull()}.
\end{itemize}
\end{frame}
%=======================================================================================%
\begin{frame}[fragile]
\texttt{notnull()} returns a boolean Series indicating which values aren't NULL.
\begin{framed}
\begin{verbatim}
cities.notnull()
Austin True
Boston False
Chicago True
New York True
Portland True
San Francisco True
dtype: bool
\end{verbatim}
\end{framed}
\end{frame}
%=======================================================================================%
%=======================================================================================%
\begin{frame}[fragile]
Using boolean logic to grab the NULL cities
\begin{framed}
\begin{verbatim}
print cities.isnull()
print '\n'
print cities[cities.isnull()]
Austin False
Boston True
Chicago False
New York False
Portland False
San Francisco False
dtype: bool
Boston NaN
dtype: float64
\end{verbatim}
\end{framed}
\end{frame}
%=======================================================================================%
\end{document}
%%---------------------------------------%
%\newpage
%\frametitle{DataFrame}
%
%% pandas - chapter 5 - DataFrame
%
%A DataFrame is a tabular data structure comprised of rows and columns, akin to a spreadsheet, database table, or R's data.frame object. You can also think of a DataFrame as a group of Series objects that share an index (the column names).
%
%%For the rest of the tutorial, we'll be primarily working with DataFrames.
%
%%---------------------------------------%
%\newpage
%\frametitle{Panel}
%
%
%
%\texttt{Panel} is a somewhat less-used, but still important container for 3-dimensional data.
%The term panel data is derived from econometrics and is partially responsible for the name pandas: pan(el)-da(ta)-s.
%The names for the 3 axes are intended to give some semantic meaning to describing operations involving panel data and,
%in particular, econometric analysis of panel data. However, for the strict purposes of slicing and dicing a
%collection of DataFrame objects, you may find the axis names slightly arbitrary:
%
%\begin{itemize}
%\item items: axis 0, each item corresponds to a DataFrame contained inside
%\item major\_axis: axis 1, it is the index (rows) of each of the DataFrames
%\item minor\_axis: axis 2, it is the columns of each of the DataFrames
%\end{itemize}
%
%\newpage
%
%\end{document}