הפקולטה לפיזיקה, הטכניון. חורף 2013
מרצה: רונן אברבנאל
כאן פשוט נציג דוגמה (מתוך הספר Python for Data Analysis).
Python for data analysis cover
הנתונים לקוחים מתוך: https://github.com/pydata/pydata-book/tree/master/ch02/names (ניתן להוריד אותם גם מכאן).
%pylab inline
Populating the interactive namespace from numpy and matplotlib
import os
os.chdir("/home/ronen/temp/pydata-book/ch02/names")
!head -n 10 yob1880.txt
Mary,F,7065 Anna,F,2604 Emma,F,2003 Elizabeth,F,1939 Minnie,F,1746 Margaret,F,1578 Ida,F,1472 Alice,F,1414 Bertha,F,1320 Sarah,F,1288
import pandas as pd
# The first line of yob1880.txt is already data (Mary,F,7065), so the column
# labels are passed in explicitly instead of being read from the file.
names1880 = pd.read_csv(
    'yob1880.txt',
    names=['name', 'sex', 'births'],
)
names1880
<class 'pandas.core.frame.DataFrame'> Int64Index: 2000 entries, 0 to 1999 Data columns (total 3 columns): name 2000 non-null values sex 2000 non-null values births 2000 non-null values dtypes: int64(1), object(2)
names1880[:5]
name | sex | births | |
---|---|---|---|
0 | Mary | F | 7065 |
1 | Anna | F | 2604 |
2 | Emma | F | 2003 |
3 | Elizabeth | F | 1939 |
4 | Minnie | F | 1746 |
names1880.groupby('sex').births.sum()
sex F 90993 M 110493 Name: births, dtype: int64
# Load one file per year (yob1880.txt ... yob2010.txt), reading each with the
# same three column names, and stack them into a single frame.
years = range(1880, 2011)
pieces = []
columns = ['name', 'sex', 'births']
for year in years:
    path = 'yob%d.txt' % year
    frame = pd.read_csv(path, names=columns)
    # The year only appears in the file name, so record it as a column
    # before the yearly frames are combined.
    frame['year'] = year
    pieces.append(frame)
# ignore_index=True discards the per-file row numbers so the combined frame
# gets one continuous index.
names = pd.concat(pieces, ignore_index=True)
names
<class 'pandas.core.frame.DataFrame'> Int64Index: 1690784 entries, 0 to 1690783 Data columns (total 4 columns): name 1690784 non-null values sex 1690784 non-null values births 1690784 non-null values year 1690784 non-null values dtypes: int64(2), object(2)
names[:5]
name | sex | births | year | |
---|---|---|---|---|
0 | Mary | F | 7065 | 1880 |
1 | Anna | F | 2604 | 1880 |
2 | Emma | F | 2003 | 1880 |
3 | Elizabeth | F | 1939 | 1880 |
4 | Minnie | F | 1746 | 1880 |
# One row per year, one column per sex, each cell the summed births.
# pivot_table names its axes `index`/`columns`; the older `rows`/`cols`
# keywords are no longer accepted by current pandas releases.
total_births = names.pivot_table(
    'births',
    index='year',
    columns='sex',
    aggfunc='sum',
)
total_births.tail()
sex | F | M |
---|---|---|
year | ||
2006 | 1896468 | 2050234 |
2007 | 1916888 | 2069242 |
2008 | 1883645 | 2032310 |
2009 | 1827643 | 1973359 |
2010 | 1759010 | 1898382 |
total_births.plot(title='Total births by sex and year')
<matplotlib.axes.AxesSubplot at 0x301ed50>
def add_prop(group):
    """Return a copy of *group* with a ``prop`` column added.

    ``prop`` is each row's ``births`` divided by the sum of ``births`` over
    the whole of *group*, i.e. the row's share of the group total.

    The frame passed in is not modified: groupby.apply hands the function
    its own group objects, and mutating them is not supported by pandas.
    """
    # Cast first so the division is a true float division even where
    # int / int would truncate.
    births = group.births.astype(float)
    result = group.copy()
    result['prop'] = births / births.sum()
    return result
# group_keys=False keeps the original flat row index on the result. Without
# it, newer pandas releases prepend the (year, sex) keys to the index, and a
# later groupby on 'year'/'sex' is then ambiguous between column and level.
names = names.groupby(['year', 'sex'], group_keys=False).apply(add_prop)
names.tail()
name | sex | births | year | prop | |
---|---|---|---|---|---|
1690779 | Zymaire | M | 5 | 2010 | 0.000003 |
1690780 | Zyonne | M | 5 | 2010 | 0.000003 |
1690781 | Zyquarius | M | 5 | 2010 | 0.000003 |
1690782 | Zyran | M | 5 | 2010 | 0.000003 |
1690783 | Zzyzx | M | 5 | 2010 | 0.000003 |
def get_top1000(group):
    """Return the rows of *group* with the most births, at most 1000 of them.

    Rows come back ordered by ``births``, largest first. A group with fewer
    than 1000 rows is returned whole (still sorted).
    """
    # sort_values is the current spelling of the removed sort_index(by=...).
    return group.sort_values(by='births', ascending=False)[:1000]
grouped = names.groupby(['year','sex'])
top1000 = grouped.apply(get_top1000)
top1000
<class 'pandas.core.frame.DataFrame'> MultiIndex: 261877 entries, (1880, F, 0) to (2010, M, 1677643) Data columns (total 5 columns): name 261877 non-null values sex 261877 non-null values births 261877 non-null values year 261877 non-null values prop 261877 non-null values dtypes: float64(1), int64(2), object(2)
boys = top1000[top1000.sex == 'M']
girls = top1000[top1000.sex=='F']
# Rebinds total_births: now one row per year and one column per name, each
# cell the summed births for that name in that year.
# pivot_table names its axes `index`/`columns`; the older `rows`/`cols`
# keywords are no longer accepted by current pandas releases.
total_births = names.pivot_table(
    'births',
    index='year',
    columns='name',
    aggfunc='sum',
)
total_births
<class 'pandas.core.frame.DataFrame'> Int64Index: 131 entries, 1880 to 2010 Columns: 88496 entries, Aaban to Zzyzx dtypes: float64(88496)
subset = total_births[['John','Harry','Mary','Marilyn']]
subset.plot(subplots=True,figsize=(12,10),grid=False)
array([<matplotlib.axes.AxesSubplot object at 0xb4b5990>, <matplotlib.axes.AxesSubplot object at 0xb4d4390>, <matplotlib.axes.AxesSubplot object at 0xb220e90>, <matplotlib.axes.AxesSubplot object at 0xb2343d0>], dtype=object)
# top1000 is indexed by (year, sex, row) while also carrying 'year' and 'sex'
# columns; drop that index so the pivot keys refer unambiguously to columns.
# pivot_table names its axes `index`/`columns` (formerly `rows`/`cols`).
table = top1000.reset_index(drop=True).pivot_table(
    'prop',
    index='year',
    columns='sex',
    aggfunc='sum',
)
table.plot(title='Sum of...', yticks=np.linspace(0,1.2,13), xticks=range(1880,2020,10))
<matplotlib.axes.AxesSubplot at 0xd4dd690>
# Collect every distinct name in the top-1000 table, then keep the ones that
# contain "lesl" when lower-cased.
all_names = top1000['name'].unique()
mask = np.array(['lesl' in candidate.lower() for candidate in all_names])
lesley_like = all_names[mask]
lesley_like
array(['Leslie', 'Lesley', 'Leslee', 'Lesli', 'Lesly'], dtype=object)
filterd = top1000[top1000.name.isin(lesley_like)]
filterd.groupby('name').births.sum()
name Leslee 1082 Lesley 35022 Lesli 929 Leslie 370429 Lesly 10067 Name: births, dtype: int64
# filterd inherits top1000's (year, sex, row) index alongside its 'year' and
# 'sex' columns; drop the index so the pivot keys are unambiguous.
# pivot_table names its axes `index`/`columns` (formerly `rows`/`cols`).
table = filterd.reset_index(drop=True).pivot_table(
    'births',
    index='year',
    columns='sex',
    aggfunc='sum',
)
# Divide each row by its own total so F and M become per-year fractions.
table = table.div(table.sum(axis=1), axis=0)
table.tail()
sex | F | M |
---|---|---|
year | ||
2006 | 1 | NaN |
2007 | 1 | NaN |
2008 | 1 | NaN |
2009 | 1 | NaN |
2010 | 1 | NaN |
table.plot()
<matplotlib.axes.AxesSubplot at 0xd716750>