סדנת פייתון לפיזיקאים

דוגמה: מגמות במתן שמות בארצות הברית והחבילה Pandas

הפקולטה לפיזיקה, הטכניון. חורף 2013

מרצה: רונן אברבנאל

פשוט נציג כאן דוגמה (מתוך הספר Python for Data Analysis).

Python for data analysis cover

Python for data analysis cover

הנתונים מתוך: https://github.com/pydata/pydata-book/tree/master/ch02/names ניתן להוריד אותם גם מכאן.

In [5]:
%pylab inline
Populating the interactive namespace from numpy and matplotlib

In [7]:
import os
os.chdir("/home/ronen/temp/pydata-book/ch02/names")
In [8]:
!head -n 10 yob1880.txt
Mary,F,7065

Anna,F,2604

Emma,F,2003

Elizabeth,F,1939

Minnie,F,1746

Margaret,F,1578

Ida,F,1472

Alice,F,1414

Bertha,F,1320

Sarah,F,1288


In [9]:
import pandas as pd
In [10]:
names1880 = pd.read_csv('yob1880.txt',names=['name','sex','births'])
In [11]:
names1880
Out[11]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 0 to 1999
Data columns (total 3 columns):
name      2000  non-null values
sex       2000  non-null values
births    2000  non-null values
dtypes: int64(1), object(2)
In [12]:
names1880[:5]
Out[12]:
name sex births
0 Mary F 7065
1 Anna F 2604
2 Emma F 2003
3 Elizabeth F 1939
4 Minnie F 1746
In [13]:
names1880.groupby('sex').births.sum()
Out[13]:
sex
F       90993
M      110493
Name: births, dtype: int64
In [14]:
# Build one table covering every yearly file, 1880 through 2010 inclusive.
columns = ['name', 'sex', 'births']
years = range(1880, 2011)
pieces = []
for year in years:
    # The column names are supplied here rather than read from the file.
    path = 'yob%d.txt' % year
    frame = pd.read_csv(path, names=columns)
    # Tag every row with its year before the per-year frames are merged.
    frame['year'] = year
    pieces.append(frame)
# ignore_index=True renumbers the rows 0..N-1 instead of repeating each
# file's own row numbers.
names = pd.concat(pieces, ignore_index=True)
names
Out[14]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1690784 entries, 0 to 1690783
Data columns (total 4 columns):
name      1690784  non-null values
sex       1690784  non-null values
births    1690784  non-null values
year      1690784  non-null values
dtypes: int64(2), object(2)
In [15]:
names[:5]
Out[15]:
name sex births year
0 Mary F 7065 1880
1 Anna F 2604 1880
2 Emma F 2003 1880
3 Elizabeth F 1939 1880
4 Minnie F 1746 1880
In [16]:
# Total births per year, one column per sex.
# `index`/`columns` replace the `rows`/`cols` keywords, which pandas removed;
# the string 'sum' selects the pandas aggregation rather than the builtin.
total_births = names.pivot_table('births', index='year', columns='sex', aggfunc='sum')
In [17]:
total_births.tail()
Out[17]:
sex F M
year
2006 1896468 2050234
2007 1916888 2069242
2008 1883645 2032310
2009 1827643 1973359
2010 1759010 1898382
In [18]:
total_births.plot(title='Total births by sex and year')
Out[18]:
<matplotlib.axes.AxesSubplot at 0x301ed50>
In [19]:
def add_prop(group):
    """Return a copy of ``group`` with a ``prop`` column added.

    ``prop`` is each row's ``births`` divided by the sum of ``births`` over
    the whole group, i.e. the row's share of the group's total.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one group; must contain a numeric ``births`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra ``prop`` column; the frame passed in is
        left unmodified.
    """
    # Cast first so the division is floating point even for integer counts.
    births = group.births.astype(float)

    # Work on a copy: functions handed to ``groupby.apply`` should not mutate
    # the object they receive.
    result = group.copy()
    result['prop'] = births / births.sum()
    return result
# Add the per-(year, sex) share of births to every row.
# group_keys=False keeps the original flat row index on the result; newer
# pandas would otherwise prepend (year, sex) index levels that clash with the
# columns of the same name.
names = names.groupby(['year', 'sex'], group_keys=False).apply(add_prop)
In [20]:
names.tail()
Out[20]:
name sex births year prop
1690779 Zymaire M 5 2010 0.000003
1690780 Zyonne M 5 2010 0.000003
1690781 Zyquarius M 5 2010 0.000003
1690782 Zyran M 5 2010 0.000003
1690783 Zzyzx M 5 2010 0.000003
In [21]:
def get_top1000(group, n=1000):
    """Return the ``n`` rows of ``group`` with the largest ``births`` values.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame containing a ``births`` column.
    n : int, optional
        Maximum number of rows to keep (default 1000).

    Returns
    -------
    pandas.DataFrame
        Rows sorted by ``births`` in descending order, truncated to ``n``
        rows; shorter groups are returned whole.
    """
    # sort_values replaces sort_index(by=...), which pandas removed.
    return group.sort_values(by='births', ascending=False)[:n]
In [22]:
# Split the table into one group per (year, sex) pair.
grouped = names.groupby(['year','sex'])
# Run get_top1000 on every group and combine the per-group results.
top1000 = grouped.apply(get_top1000)
# NOTE(review): the recorded output shows a MultiIndex whose first levels are
# year and sex, while 'year' and 'sex' also remain columns -- confirm that the
# later pivot_table calls on top1000 accept this with the pandas version in use.
top1000
Out[22]:
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 261877 entries, (1880, F, 0) to (2010, M, 1677643)
Data columns (total 5 columns):
name      261877  non-null values
sex       261877  non-null values
births    261877  non-null values
year      261877  non-null values
prop      261877  non-null values
dtypes: float64(1), int64(2), object(2)
In [23]:
# Top-1000 rows split by sex.
boys = top1000[top1000.sex == 'M']
girls = top1000[top1000.sex == 'F']

# Births per year with one column per name, built from the full `names` table.
# `index`/`columns` replace the `rows`/`cols` keywords, which pandas removed.
total_births = names.pivot_table('births', index='year', columns='name', aggfunc='sum')
In [24]:
total_births
Out[24]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 131 entries, 1880 to 2010
Columns: 88496 entries, Aaban to Zzyzx
dtypes: float64(88496)
In [25]:
subset = total_births[['John','Harry','Mary','Marilyn']]
subset.plot(subplots=True,figsize=(12,10),grid=False)
Out[25]:
array([<matplotlib.axes.AxesSubplot object at 0xb4b5990>,
       <matplotlib.axes.AxesSubplot object at 0xb4d4390>,
       <matplotlib.axes.AxesSubplot object at 0xb220e90>,
       <matplotlib.axes.AxesSubplot object at 0xb2343d0>], dtype=object)
In [26]:
# Per year and sex, the summed `prop` of the rows kept in top1000.
# `index`/`columns` replace the `rows`/`cols` keywords, which pandas removed.
table = top1000.pivot_table('prop', index='year', columns='sex', aggfunc='sum')
table.plot(title='Sum of...', yticks=np.linspace(0, 1.2, 13), xticks=range(1880, 2020, 10))
Out[26]:
<matplotlib.axes.AxesSubplot at 0xd4dd690>
In [27]:
all_names = top1000.name.unique()
mask = np.array(['lesl' in x.lower() for x in all_names])
lesley_like = all_names[mask]

lesley_like
Out[27]:
array(['Leslie', 'Lesley', 'Leslee', 'Lesli', 'Lesly'], dtype=object)
In [28]:
filterd = top1000[top1000.name.isin(lesley_like)]
filterd.groupby('name').births.sum()
Out[28]:
name
Leslee      1082
Lesley     35022
Lesli        929
Leslie    370429
Lesly      10067
Name: births, dtype: int64
In [29]:
# Births of the Lesley-like names per year, one column per sex.
# `index`/`columns` replace the `rows`/`cols` keywords, which pandas removed.
table = filterd.pivot_table('births', index='year', columns='sex', aggfunc='sum')
In [30]:
table = table.div(table.sum(1),axis=0)
table.tail()
Out[30]:
sex F M
year
2006 1 NaN
2007 1 NaN
2008 1 NaN
2009 1 NaN
2010 1 NaN
In [27]:
table.plot()
Out[27]:
<matplotlib.axes.AxesSubplot at 0xd716750>
In []: