import math
import collections
import numpy as np
import pandas as pd
import matplotlib.pyplot as pp
%matplotlib inline
pd.options.display.max_rows = 8
# Peek at the first ten raw lines of the 2011 file. The context manager
# closes the file handle instead of leaving it open until garbage collection.
with open('names/yob2011.txt', 'r') as raw_file:
    first_lines = raw_file.readlines()[:10]
first_lines
['Sophia,F,21842\n', 'Isabella,F,19910\n', 'Emma,F,18803\n', 'Olivia,F,17322\n', 'Ava,F,15503\n', 'Emily,F,14258\n', 'Abigail,F,13248\n', 'Madison,F,12389\n', 'Mia,F,11530\n', 'Chloe,F,10991\n']
# No `names=` given, so the first record ("Sophia,F,21842") is consumed as the header row.
pd.read_csv('names/yob2011.txt')
Sophia | F | 21842 | |
---|---|---|---|
0 | Isabella | F | 19910 |
1 | Emma | F | 18803 |
2 | Olivia | F | 17322 |
3 | Ava | F | 15503 |
... | ... | ... | ... |
33903 | Zylas | M | 5 |
33904 | Zyran | M | 5 |
33905 | Zyshawn | M | 5 |
33906 | Zytavion | M | 5 |
33907 rows × 3 columns
# Explicit column names: every line of the file is now read as data.
pd.read_csv('names/yob2011.txt',names=['name','sex','number'])
name | sex | number | |
---|---|---|---|
0 | Sophia | F | 21842 |
1 | Isabella | F | 19910 |
2 | Emma | F | 18803 |
... | ... | ... | ... |
33905 | Zyran | M | 5 |
33906 | Zyshawn | M | 5 |
33907 | Zytavion | M | 5 |
33908 rows × 3 columns
# Same load, plus a constant `year` column added with assign().
pd.read_csv('names/yob2011.txt',names=['name','sex','number']).assign(year=2011)
name | sex | number | year | |
---|---|---|---|---|
0 | Sophia | F | 21842 | 2011 |
1 | Isabella | F | 19910 | 2011 |
2 | Emma | F | 18803 | 2011 |
... | ... | ... | ... | ... |
33905 | Zyran | M | 5 | 2011 |
33906 | Zyshawn | M | 5 | 2011 |
33907 | Zytavion | M | 5 | 2011 |
33908 rows × 4 columns
# Load one file per year (1880 through 2018), tag each frame with its year,
# and stack the frames into a single table.
allyears = pd.concat(
    [
        pd.read_csv(f'names/yob{year}.txt', names=['name', 'sex', 'number']).assign(year=year)
        for year in range(1880, 2019)
    ]
)
allyears.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1957046 entries, 0 to 32032 Data columns (total 4 columns): # Column Dtype --- ------ ----- 0 name object 1 sex object 2 number int64 3 year int64 dtypes: int64(2), object(2) memory usage: 59.7+ MB
# Check the span of years present in the combined table.
allyears.year.min(), allyears.year.max()
(1880, 2018)
# Save the combined table without its row index, then load it back from that file.
allyears.to_csv('allyears.csv.zip',index=False)
pd.options.display.max_rows = 6
allyears = pd.read_csv('allyears.csv.zip')
# Display the reloaded table.
allyears
name | sex | number | year | |
---|---|---|---|---|
0 | Mary | F | 7065 | 1880 |
1 | Anna | F | 2604 | 1880 |
2 | Emma | F | 2003 | 1880 |
... | ... | ... | ... | ... |
1957043 | Zyrie | M | 5 | 2018 |
1957044 | Zyron | M | 5 | 2018 |
1957045 | Zzyzx | M | 5 | 2018 |
1957046 rows × 4 columns
# Index by (sex, name, year) and sort, so a (sex, name) pair selects that name's yearly rows.
allyears_indexed = allyears.set_index(['sex','name','year']).sort_index()
# Plot the yearly counts recorded for girls named Mary.
pp.plot(allyears_indexed.loc[('F','Mary')])
[<matplotlib.lines.Line2D at 0xb61fe30>]
# Mary's count divided by the total number of recorded babies in each year.
# Only `number` is aggregated: summing the whole frame would also try to
# aggregate the string columns `name` and `sex`, which relied on pandas
# silently dropping non-numeric columns.
pp.plot(allyears_indexed.loc[('F','Mary')] / allyears.groupby('year')[['number']].sum())
[<matplotlib.lines.Line2D at 0xa843150>]
def plotname(sex, name):
    """Draw the yearly counts of one name on the current matplotlib axes.

    Looks the (sex, name) pair up in the module-level `allyears_indexed`
    table, plots the remaining index against the values with `name` as the
    legend label, and pins the x axis to 1880-2018.
    """
    selected = allyears_indexed.loc[(sex, name)]
    x_values, y_values = selected.index, selected.values
    pp.plot(x_values, y_values, label=name)
    pp.axis(xmin=1880, xmax=2018)
def comparenames(sex, names):
    """Plot several names of the same sex together in one wide figure.

    Opens a new 12 x 2.5 figure, calls `plotname` once per entry of `names`,
    and finishes with a legend.
    """
    pp.figure(figsize=(12, 2.5))
    for candidate in names:
        plotname(sex, candidate)
    pp.legend()
# Compare a handful of boys' names, then a handful of girls' names.
comparenames('M' ,['Michael','John','David','Martin'])
comparenames('F' ,['Emily','Anna','Claire','Elizabeth'])
# Spelling variants of "Claire", compared here and stacked in a later cell.
claires = ['Claire','Clare','Clara','Chiara','Ciara']
comparenames('F', claires)
# Select the rows of all listed variants at once (list on the name level, all columns).
allyears_indexed.loc[('F',claires),:]
number | |||
---|---|---|---|
sex | name | year | |
F | Claire | 1880 | 21 |
1881 | 23 | ||
1882 | 30 | ||
... | ... | ... | |
Ciara | 2016 | 321 | |
2017 | 243 | ||
2018 | 256 |
522 rows × 1 columns
pp.figure(figsize=(12,2.5))
# unstack(level=2) pivots the year level into columns, leaving one row per name.
# NOTE(review): years with no record for a variant presumably come out as NaN here;
# the following cell repeats this plot with fillna(0) — confirm that is the reason.
pp.stackplot(range(1880,2019), allyears_indexed.loc[('F',claires),:].unstack(level=2));
pp.figure(figsize=(12,2.5))
# One row per spelling variant, one column per year; missing years count as 0.
claires_by_year = allyears_indexed.loc[('F',claires),:].unstack(level=2).fillna(0)
# Take the legend labels and x values from the frame that is actually plotted.
# unstack() returns rows in sorted index order, which need not be the order of
# `claires`, so passing `claires` as labels could tag bands with the wrong name.
pp.stackplot(claires_by_year.columns.get_level_values('year'),
             claires_by_year,
             labels=list(claires_by_year.index.get_level_values('name')));
pp.legend(loc='upper left')
pp.axis(xmin=1880, xmax=2018);
pd.options.display.max_rows = 10
allyears = pd.read_csv('allyears.csv.zip')
# Index by (sex, year) so one sex in one year can be selected directly.
allyears_byyear = allyears.set_index(['sex','year']).sort_index()
# Boys' names of 2018, most common first.
allyears_byyear.loc['M',2018].sort_values('number', ascending=False)
name | number | ||
---|---|---|---|
sex | year | ||
M | 2018 | Liam | 19837 |
2018 | Noah | 18267 | |
2018 | William | 14516 | |
2018 | James | 13525 | |
2018 | Oliver | 13389 | |
... | ... | ... | |
2018 | Gaylon | 5 | |
2018 | Gavynn | 5 | |
2018 | Gavon | 5 | |
2018 | Gaurav | 5 | |
2018 | Zzyzx | 5 |
14004 rows × 2 columns
# Keep only the ten most common boys' names of 2018.
allyears_byyear.loc['M',2018].sort_values('number', ascending=False).head(10)
name | number | ||
---|---|---|---|
sex | year | ||
M | 2018 | Liam | 19837 |
2018 | Noah | 18267 | |
2018 | William | 14516 | |
2018 | James | 13525 | |
2018 | Oliver | 13389 | |
2018 | Benjamin | 13381 | |
2018 | Elijah | 12886 | |
2018 | Lucas | 12585 | |
2018 | Mason | 12435 | |
2018 | Logan | 12352 |
# Girls' names of 2018, most common first.
allyears_byyear.loc['F',2018].sort_values('number', ascending=False)
name | number | ||
---|---|---|---|
sex | year | ||
F | 2018 | Emma | 18688 |
2018 | Olivia | 17921 | |
2018 | Ava | 14924 | |
2018 | Isabella | 14464 | |
2018 | Sophia | 13928 | |
... | ... | ... | |
2018 | Ghala | 5 | |
2018 | Ghalia | 5 | |
2018 | Ghislaine | 5 | |
2018 | Giahna | 5 | |
2018 | Zyona | 5 |
18029 rows × 2 columns
# Top ten girls' names of 2018 as a Series; reset_index() replaces the (sex, year) index with 0..9.
allyears_byyear.loc['F',2018].sort_values('number', ascending=False).head(10).reset_index().name
0 Emma 1 Olivia 2 Ava 3 Isabella 4 Sophia 5 Charlotte 6 Mia 7 Amelia 8 Harper 9 Evelyn Name: name, dtype: object
def getyear(sex, year, n=10):
    """Return the `n` most common names for one sex in one year.

    Reads the module-level `allyears_byyear` table, which is indexed by
    (sex, year).

    Parameters:
        sex: value of the `sex` index level to select.
        year: value of the `year` index level to select.
        n: how many names to keep; defaults to 10, the previous fixed size.

    Returns:
        The `name` column of the selected rows, ordered by descending
        `number` and carrying a fresh 0-based index.
    """
    ranked = allyears_byyear.loc[sex, year].sort_values('number', ascending=False)
    # reset_index() gives every result the same 0..n-1 index, so the results
    # for several years line up row by row when combined into one DataFrame.
    return ranked.head(n).reset_index().name
# One column per year from 2010 to 2018, one row per rank: top ten boys' names side by side.
pd.DataFrame({year: getyear('M',year) for year in range(2010,2019)})
2010 | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | 2018 | |
---|---|---|---|---|---|---|---|---|---|
0 | Jacob | Jacob | Jacob | Noah | Noah | Noah | Noah | Liam | Liam |
1 | Ethan | Mason | Mason | Jacob | Liam | Liam | Liam | Noah | Noah |
2 | Michael | William | Ethan | Liam | Mason | Mason | William | William | William |
3 | Jayden | Jayden | Noah | Mason | Jacob | Jacob | Mason | James | James |
4 | William | Noah | William | William | William | William | James | Logan | Oliver |
5 | Alexander | Michael | Liam | Ethan | Ethan | Ethan | Benjamin | Benjamin | Benjamin |
6 | Noah | Ethan | Michael | Michael | Michael | James | Jacob | Mason | Elijah |
7 | Daniel | Alexander | Jayden | Alexander | Alexander | Alexander | Michael | Elijah | Lucas |
8 | Aiden | Aiden | Alexander | Jayden | James | Michael | Elijah | Oliver | Mason |
9 | Anthony | Daniel | Aiden | Daniel | Daniel | Benjamin | Ethan | Jacob | Logan |
# One column per year from 2010 to 2018, one row per rank: top ten girls' names side by side.
pd.DataFrame({year: getyear('F',year) for year in range(2010,2019)})
2010 | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | 2018 | |
---|---|---|---|---|---|---|---|---|---|
0 | Isabella | Sophia | Sophia | Sophia | Emma | Emma | Emma | Emma | Emma |
1 | Sophia | Isabella | Emma | Emma | Olivia | Olivia | Olivia | Olivia | Olivia |
2 | Emma | Emma | Isabella | Olivia | Sophia | Sophia | Ava | Ava | Ava |
3 | Olivia | Olivia | Olivia | Isabella | Isabella | Ava | Sophia | Isabella | Isabella |
4 | Ava | Ava | Ava | Ava | Ava | Isabella | Isabella | Sophia | Sophia |
5 | Emily | Emily | Emily | Mia | Mia | Mia | Mia | Mia | Charlotte |
6 | Abigail | Abigail | Abigail | Emily | Emily | Abigail | Charlotte | Charlotte | Mia |
7 | Madison | Madison | Mia | Abigail | Abigail | Emily | Abigail | Amelia | Amelia |
8 | Chloe | Mia | Madison | Madison | Madison | Charlotte | Emily | Evelyn | Harper |
9 | Mia | Chloe | Elizabeth | Elizabeth | Charlotte | Harper | Harper | Abigail | Evelyn |
def plotname(sex, name):
    """Draw the yearly counts of one name on the current matplotlib axes.

    Filters the flat module-level `allyears` table down to the rows matching
    `sex` and `name`, plots `number` against `year` with `name` as the legend
    label, and pins the x axis to 1880-2018.
    """
    # The query string reads the `sex` and `name` parameters through the @ prefix.
    selected = allyears.query('sex ==@sex and name == @name')
    years, counts = selected.year, selected.number
    pp.plot(years, counts, label=name)
    pp.axis(xmin=1880, xmax=2018)
# Plot the history of 2018's ten most common girls' names. comparenames() already
# does figure -> plotname per name -> legend, and picks up the current plotname
# definition when it is called, so the loop is not repeated here.
comparenames('F', getyear('F',2018))
# Plot the history of 2018's ten most common boys' names. comparenames() already
# does figure -> plotname per name -> legend, and picks up the current plotname
# definition when it is called, so the loop is not repeated here.
comparenames('M', getyear('M',2018))
# Show the 2018 boys' top ten as a plain Python list.
list(getyear('M',2018))
['Liam', 'Noah', 'William', 'James', 'Oliver', 'Benjamin', 'Elijah', 'Lucas', 'Mason', 'Logan']
# The 2018 boys' top ten without William and James. comparenames() provides the
# same figure -> plotname per name -> legend sequence as the explicit loop did.
comparenames('M', ['Liam','Noah','Oliver','Benjamin','Elijah','Lucas','Mason','Logan'])
# Ten girls' names with the largest totals summed over every year on record.
alltime_f = (
    allyears_byyear.loc['F']
    .groupby('name')
    .sum()
    .sort_values('number', ascending=False)
    .head(10)
)
alltime_f
number | |
---|---|
name | |
Mary | 4125675 |
Elizabeth | 1638349 |
Patricia | 1572016 |
Jennifer | 1467207 |
Linda | 1452668 |
Barbara | 1434397 |
Margaret | 1248985 |
Susan | 1121703 |
Dorothy | 1107635 |
Sarah | 1077746 |
# Plot the history of the ten all-time most common girls' names. comparenames()
# provides the same figure -> plotname per name -> legend sequence as the loop did.
comparenames('F', alltime_f.index)
pd.options.display.max_rows = 10
allyears = pd.read_csv('allyears.csv.zip')
# Total number of babies per (sex, name), summed over all years.
totals = allyears.groupby(['sex','name']).number.sum()
totals
sex name F Aabha 35 Aabidah 5 Aabriella 38 Aada 13 Aadaya 8 .. M Zyus 11 Zyvion 5 Zyvon 7 Zyyon 6 Zzyzx 10 Name: number, Length: 109173, dtype: int64
# Split the totals into one Series per sex, each indexed by name.
male,female = totals.loc['M'],totals.loc['F']
male
name Aaban 114 Aabid 16 Aabir 10 Aadam 273 Aadan 130 ... Zyus 11 Zyvion 5 Zyvon 7 Zyyon 6 Zzyzx 10 Name: number, Length: 41475, dtype: int64
# Display the girls' totals.
female
name Aabha 35 Aabidah 5 Aabriella 38 Aada 13 Aadaya 8 .. Zyrielle 27 Zyrihanna 45 Zyriyah 16 Zyva 23 Zyyanna 6 Name: number, Length: 67698, dtype: int64
# The division aligns on name; a name recorded for only one sex has no ratio (NaN) and compares False.
totals.loc['M'] / totals.loc['F'] < 2
name Aaban False Aabha False Aabid False Aabidah False Aabir False ... Zyvion False Zyvon False Zyyanna False Zyyon False Zzyzx False Name: number, Length: 98400, dtype: bool
# Male-to-female ratio per name; dropna() keeps only names recorded for both sexes.
ratios = (totals.loc['M'] / totals.loc['F']).dropna()
# Names whose ratio lies strictly between 1:2 and 2:1.
ratios[(ratios > 0.5) & (ratios < 2)]
name Aalijah 1.422819 Aamari 1.430233 Aari 1.111111 Aarin 1.950331 Aaris 1.000000 ... Ziyan 1.251748 Zoel 1.928571 Zohar 0.520270 Zyian 0.714286 Zyrie 0.842105 Name: number, Length: 1660, dtype: float64
# Keep just the names (the index) that pass the same ratio filter.
unisex = ratios[(ratios > 0.5) & (ratios < 2)].index
unisex
Index(['Aalijah', 'Aamari', 'Aari', 'Aarin', 'Aaris', 'Aaryn', 'Aavyn', 'Abey', 'Abrar', 'Abriel', ... 'Zell', 'Zi', 'Ziel', 'Zihan', 'Zixuan', 'Ziyan', 'Zoel', 'Zohar', 'Zyian', 'Zyrie'], dtype='object', name='name', length=1660)
# Combined male + female totals of the unisex names; keep the ten largest.
common = (male.loc[unisex] + female.loc[unisex]).sort_values(ascending=False).head(10)
common
name Jessie 277674 Riley 201179 Casey 186947 Jackie 169199 Peyton 120657 Jaime 117885 Kerry 98195 Kendall 93317 Jody 86971 Frankie 74325 Name: number, dtype: int64
# Rebuild the (sex, name, year) index on the freshly reloaded table.
allyears_indexed = allyears.set_index(['sex', 'name', 'year']).sort_index()

# One panel per name in `common`, laid out on a 5 x 2 grid, with the male
# and female curves overlaid in each panel.
pp.figure(figsize=(9, 9))
for slot, name in enumerate(common.index, start=1):
    pp.subplot(5, 2, slot)
    for sex in ('M', 'F'):
        pp.plot(allyears_indexed.loc[sex, name], label=sex)
    pp.legend()
    pp.title(name)
pp.tight_layout()