import math
import collections
import numpy as np
import pandas as pd
import matplotlib.pyplot as pp
%matplotlib inline
pd.options.display.max_rows = 8
# Peek at the first ten raw lines to see the file layout before parsing it.
# The context manager closes the handle, and zip() stops after ten lines so
# the rest of the file is never read into memory.
with open('names/yob2011.txt', 'r') as namesfile:
    firstlines = [line for _, line in zip(range(10), namesfile)]
firstlines
['Sophia,F,21842\n', 'Isabella,F,19910\n', 'Emma,F,18803\n', 'Olivia,F,17322\n', 'Ava,F,15503\n', 'Emily,F,14258\n', 'Abigail,F,13248\n', 'Madison,F,12389\n', 'Mia,F,11530\n', 'Chloe,F,10991\n']
# Naive load: with no names= argument, read_csv consumes the first record
# ('Sophia,F,21842') as the column header, as the output below shows.
pd.read_csv('names/yob2011.txt')
Sophia | F | 21842 | |
---|---|---|---|
0 | Isabella | F | 19910 |
1 | Emma | F | 18803 |
2 | Olivia | F | 17322 |
3 | Ava | F | 15503 |
... | ... | ... | ... |
33903 | Zylas | M | 5 |
33904 | Zyran | M | 5 |
33905 | Zyshawn | M | 5 |
33906 | Zytavion | M | 5 |
33907 rows × 3 columns
# Supplying names= labels the columns and keeps the first record as data.
pd.read_csv('names/yob2011.txt',names=['name','sex','number'])
name | sex | number | |
---|---|---|---|
0 | Sophia | F | 21842 |
1 | Isabella | F | 19910 |
2 | Emma | F | 18803 |
... | ... | ... | ... |
33905 | Zyran | M | 5 |
33906 | Zyshawn | M | 5 |
33907 | Zytavion | M | 5 |
33908 rows × 3 columns
# assign() appends a constant 'year' column, so rows stay identifiable once
# tables from several yearly files are combined.
pd.read_csv('names/yob2011.txt',names=['name','sex','number']).assign(year=2011)
name | sex | number | year | |
---|---|---|---|---|
0 | Sophia | F | 21842 | 2011 |
1 | Isabella | F | 19910 | 2011 |
2 | Emma | F | 18803 | 2011 |
... | ... | ... | ... | ... |
33905 | Zyran | M | 5 | 2011 |
33906 | Zyshawn | M | 5 | 2011 |
33907 | Zytavion | M | 5 | 2011 |
33908 rows × 4 columns
def _load_year(year):
    """Read one yearly names file and tag every row with its year."""
    frame = pd.read_csv(f'names/yob{year}.txt', names=['name','sex','number'])
    return frame.assign(year=year)


# Stack the per-year tables (1880 through 2018) into a single DataFrame.
allyears = pd.concat([_load_year(year) for year in range(1880, 2019)])
allyears.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1957046 entries, 0 to 32032 Data columns (total 4 columns): # Column Dtype --- ------ ----- 0 name object 1 sex object 2 number int64 3 year int64 dtypes: int64(2), object(2) memory usage: 59.7+ MB
allyears.year.min(), allyears.year.max()
(1880, 2018)
# Save the combined table without its row index, then reload it from that
# file so later cells can start from the saved copy.
allyears.to_csv('allyears.csv.zip',index=False)
# Show at most six rows when a DataFrame is displayed.
pd.options.display.max_rows = 6
allyears = pd.read_csv('allyears.csv.zip')
allyears
name | sex | number | year | |
---|---|---|---|---|
0 | Mary | F | 7065 | 1880 |
1 | Anna | F | 2604 | 1880 |
2 | Emma | F | 2003 | 1880 |
... | ... | ... | ... | ... |
1957043 | Zyrie | M | 5 | 2018 |
1957044 | Zyron | M | 5 | 2018 |
1957045 | Zzyzx | M | 5 | 2018 |
1957046 rows × 4 columns
# Re-index the table by (sex, name, year) and sort the index, then plot the
# rows recorded for the female name Mary.
index_levels = ['sex','name','year']
allyears_indexed = allyears.set_index(index_levels)
allyears_indexed = allyears_indexed.sort_index()
mary = allyears_indexed.loc[('F','Mary')]
pp.plot(mary)
[<matplotlib.lines.Line2D at 0xb61fe30>]
# Plot Mary's count as a fraction of all recorded births in each year.
# Only 'number' is summed: summing the whole frame also aggregates the text
# columns ('name', 'sex'), which recent pandas versions no longer drop
# silently. The double brackets keep the totals as a one-column DataFrame so
# the division aligns on both the year index and the 'number' column.
yearly_totals = allyears.groupby('year')[['number']].sum()
pp.plot(allyears_indexed.loc[('F','Mary')] / yearly_totals)
[<matplotlib.lines.Line2D at 0xa843150>]
def plotname(sex, name):
    """Plot the yearly counts recorded for one name of the given sex.

    Looks up the (sex, name) rows of the module-level ``allyears_indexed``
    table and draws them on the current matplotlib axes, labelled with
    ``name``. The x-axis is pinned to 1880-2018.
    """
    counts = allyears_indexed.loc[(sex, name)]
    # After selecting on (sex, name) the remaining index level is the year.
    years, values = counts.index, counts.values
    pp.plot(years, values, label=name)
    pp.axis(xmin=1880, xmax=2018)
def comparenames(sex, names):
    """Draw the yearly counts of several names of one sex on one figure.

    Opens a new 12 x 2.5 figure, calls ``plotname`` once per entry of
    ``names``, and adds a legend.
    """
    pp.figure(figsize=(12,2.5))
    for candidate in names:
        plotname(sex, candidate)
    # NOTE(review): the flattened source does not show whether legend() sat
    # inside the loop; it is placed after it here.
    pp.legend()
# Compare a few common male names, then a few common female names.
boys = ['Michael','John','David','Martin']
girls = ['Emily','Anna','Claire','Elizabeth']
comparenames('M', boys)
comparenames('F', girls)

# Spelling variants of "Claire", compared against each other.
claires = ['Claire','Clare','Clara','Chiara','Ciara']
comparenames('F', claires)
allyears_indexed.loc[('F',claires),:]
number | |||
---|---|---|---|
sex | name | year | |
F | Claire | 1880 | 21 |
1881 | 23 | ||
1882 | 30 | ||
... | ... | ... | |
Ciara | 2016 | 321 | |
2017 | 243 | ||
2018 | 256 |
522 rows × 1 columns
# First attempt at a stacked plot of the spelling variants: one row per name,
# one column per year after unstack(). Unlike the next cell there is no
# fillna(0) here, so any (name, year) combination without a record presumably
# reaches stackplot as NaN.
pp.figure(figsize=(12,2.5))
pp.stackplot(range(1880,2019), allyears_indexed.loc[('F',claires),:].unstack(level=2));
pp.figure(figsize=(12,2.5))
# One row per name, one column per year; years with no record count as zero.
variants = allyears_indexed.loc[('F',claires),:].unstack(level=2).fillna(0)
# Take the x values and the band labels from the table itself. The row order
# comes from the index (which unstack() sorts), not from the order of
# `claires`, so labelling with that list could attach the wrong name to a
# band. The year level likewise reflects the years actually present.
pp.stackplot(variants.columns.get_level_values('year'), variants,
             labels=variants.index.get_level_values('name'));
pp.legend(loc='upper left')
pp.axis(xmin=1880, xmax=2018);
# Show at most ten rows when a DataFrame is displayed.
pd.options.display.max_rows = 10
# Start again from the saved combined table.
allyears = pd.read_csv('allyears.csv.zip')
# Index by (sex, year) so all names for one sex and year can be selected
# with a single lookup.
allyears_byyear = allyears.set_index(['sex','year']).sort_index()
# All male names of 2018, most common first.
allyears_byyear.loc['M',2018].sort_values('number', ascending=False)
name | number | ||
---|---|---|---|
sex | year | ||
M | 2018 | Liam | 19837 |
2018 | Noah | 18267 | |
2018 | William | 14516 | |
2018 | James | 13525 | |
2018 | Oliver | 13389 | |
... | ... | ... | |
2018 | Gaylon | 5 | |
2018 | Gavynn | 5 | |
2018 | Gavon | 5 | |
2018 | Gaurav | 5 | |
2018 | Zzyzx | 5 |
14004 rows × 2 columns
allyears_byyear.loc['M',2018].sort_values('number', ascending=False).head(10)
name | number | ||
---|---|---|---|
sex | year | ||
M | 2018 | Liam | 19837 |
2018 | Noah | 18267 | |
2018 | William | 14516 | |
2018 | James | 13525 | |
2018 | Oliver | 13389 | |
2018 | Benjamin | 13381 | |
2018 | Elijah | 12886 | |
2018 | Lucas | 12585 | |
2018 | Mason | 12435 | |
2018 | Logan | 12352 |
allyears_byyear.loc['F',2018].sort_values('number', ascending=False)
name | number | ||
---|---|---|---|
sex | year | ||
F | 2018 | Emma | 18688 |
2018 | Olivia | 17921 | |
2018 | Ava | 14924 | |
2018 | Isabella | 14464 | |
2018 | Sophia | 13928 | |
... | ... | ... | |
2018 | Ghala | 5 | |
2018 | Ghalia | 5 | |
2018 | Ghislaine | 5 | |
2018 | Giahna | 5 | |
2018 | Zyona | 5 |
18029 rows × 2 columns
allyears_byyear.loc['F',2018].sort_values('number', ascending=False).head(10).reset_index().name
0 Emma 1 Olivia 2 Ava 3 Isabella 4 Sophia 5 Charlotte 6 Mia 7 Amelia 8 Harper 9 Evelyn Name: name, dtype: object
def getyear(sex, year, top=10):
    """Return the most popular names for one sex and year, most common first.

    Args:
        sex: Value of the 'sex' index level of ``allyears_byyear``.
        year: Value of the 'year' index level of ``allyears_byyear``.
        top: How many names to return. Defaults to 10, the count that was
            previously hard-coded.

    Returns:
        A Series of names with a fresh 0-based index, so position 0 holds
        the most common name. It is shorter than ``top`` when fewer names
        are recorded.

    Raises:
        KeyError: If no rows exist for the requested (sex, year).
    """
    ranked = allyears_byyear.loc[sex, year].sort_values('number', ascending=False)
    # reset_index() swaps the (sex, year) index for 0..n-1, which lets the
    # results for several years sit side by side in one DataFrame by rank.
    # Bracket access is used because attribute access to a column called
    # 'name' is easily confused with the .name attribute of pandas objects.
    return ranked.head(top).reset_index()['name']
pd.DataFrame({year: getyear('M',year) for year in range(2010,2019)})
2010 | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | 2018 | |
---|---|---|---|---|---|---|---|---|---|
0 | Jacob | Jacob | Jacob | Noah | Noah | Noah | Noah | Liam | Liam |
1 | Ethan | Mason | Mason | Jacob | Liam | Liam | Liam | Noah | Noah |
2 | Michael | William | Ethan | Liam | Mason | Mason | William | William | William |
3 | Jayden | Jayden | Noah | Mason | Jacob | Jacob | Mason | James | James |
4 | William | Noah | William | William | William | William | James | Logan | Oliver |
5 | Alexander | Michael | Liam | Ethan | Ethan | Ethan | Benjamin | Benjamin | Benjamin |
6 | Noah | Ethan | Michael | Michael | Michael | James | Jacob | Mason | Elijah |
7 | Daniel | Alexander | Jayden | Alexander | Alexander | Alexander | Michael | Elijah | Lucas |
8 | Aiden | Aiden | Alexander | Jayden | James | Michael | Elijah | Oliver | Mason |
9 | Anthony | Daniel | Aiden | Daniel | Daniel | Benjamin | Ethan | Jacob | Logan |
pd.DataFrame({year: getyear('F',year) for year in range(2010,2019)})
2010 | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | 2018 | |
---|---|---|---|---|---|---|---|---|---|
0 | Isabella | Sophia | Sophia | Sophia | Emma | Emma | Emma | Emma | Emma |
1 | Sophia | Isabella | Emma | Emma | Olivia | Olivia | Olivia | Olivia | Olivia |
2 | Emma | Emma | Isabella | Olivia | Sophia | Sophia | Ava | Ava | Ava |
3 | Olivia | Olivia | Olivia | Isabella | Isabella | Ava | Sophia | Isabella | Isabella |
4 | Ava | Ava | Ava | Ava | Ava | Isabella | Isabella | Sophia | Sophia |
5 | Emily | Emily | Emily | Mia | Mia | Mia | Mia | Mia | Charlotte |
6 | Abigail | Abigail | Abigail | Emily | Emily | Abigail | Charlotte | Charlotte | Mia |
7 | Madison | Madison | Mia | Abigail | Abigail | Emily | Abigail | Amelia | Amelia |
8 | Chloe | Mia | Madison | Madison | Madison | Charlotte | Emily | Evelyn | Harper |
9 | Mia | Chloe | Elizabeth | Elizabeth | Charlotte | Harper | Harper | Abigail | Evelyn |
def plotname(sex, name):
    """Plot the yearly counts of one name using the flat ``allyears`` table.

    Filters ``allyears`` to the rows matching both ``sex`` and ``name`` and
    draws number against year on the current axes, labelled with ``name``.
    The x-axis is pinned to 1880-2018.
    """
    # '@sex' and '@name' in the query string refer to this function's arguments.
    matches = allyears.query('sex ==@sex and name == @name')
    years, counts = matches.year, matches.number
    pp.plot(years, counts, label=name)
    pp.axis(xmin=1880, xmax=2018)
# Plot the history of each of the top female names of 2018 on one figure.
pp.figure(figsize=(12,2.5))
top_names = getyear('F',2018)
for popular in top_names:
    plotname('F', popular)
pp.legend()