import math
import collections
import numpy as np
import pandas as pd
import matplotlib.pyplot as pp
%matplotlib inline
pd.options.display.max_rows = 8

# Peek at the first ten raw lines of one yearly file. The context manager
# closes the file handle as soon as the lines have been read.
with open('names/yob2011.txt', 'r') as yob_file:
    first_lines = yob_file.readlines()[:10]
first_lines
['Sophia,F,21842\n', 'Isabella,F,19910\n', 'Emma,F,18803\n', 'Olivia,F,17322\n', 'Ava,F,15503\n', 'Emily,F,14258\n', 'Abigail,F,13248\n', 'Madison,F,12389\n', 'Mia,F,11530\n', 'Chloe,F,10991\n']
# Load the file with read_csv defaults: the first data line ends up as the
# column header (see 'Sophia | F | 21842' in the header row of the output).
pd.read_csv('names/yob2011.txt')
| Sophia | F | 21842 | |
|---|---|---|---|
| 0 | Isabella | F | 19910 |
| 1 | Emma | F | 18803 |
| 2 | Olivia | F | 17322 |
| 3 | Ava | F | 15503 |
| ... | ... | ... | ... |
| 33903 | Zylas | M | 5 |
| 33904 | Zyran | M | 5 |
| 33905 | Zyshawn | M | 5 |
| 33906 | Zytavion | M | 5 |
33907 rows × 3 columns
# Supply explicit column names so that the first line is read as data.
pd.read_csv(
    'names/yob2011.txt',
    names=['name', 'sex', 'number'],
)
| name | sex | number | |
|---|---|---|---|
| 0 | Sophia | F | 21842 |
| 1 | Isabella | F | 19910 |
| 2 | Emma | F | 18803 |
| ... | ... | ... | ... |
| 33905 | Zyran | M | 5 |
| 33906 | Zyshawn | M | 5 |
| 33907 | Zytavion | M | 5 |
33908 rows × 3 columns
# Same load as before, plus a constant 'year' column tagging every row.
(
    pd.read_csv('names/yob2011.txt', names=['name', 'sex', 'number'])
    .assign(year=2011)
)
| name | sex | number | year | |
|---|---|---|---|---|
| 0 | Sophia | F | 21842 | 2011 |
| 1 | Isabella | F | 19910 | 2011 |
| 2 | Emma | F | 18803 | 2011 |
| ... | ... | ... | ... | ... |
| 33905 | Zyran | M | 5 | 2011 |
| 33906 | Zyshawn | M | 5 | 2011 |
| 33907 | Zytavion | M | 5 | 2011 |
33908 rows × 4 columns
# Read one file per year from 1880 through 2018, tag each row with its year,
# and stack all the yearly frames into a single table.
allyears = pd.concat(
    [
        pd.read_csv(f'names/yob{year}.txt', names=['name', 'sex', 'number']).assign(year=year)
        for year in range(1880, 2019)
    ]
)
allyears.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1957046 entries, 0 to 32032 Data columns (total 4 columns): # Column Dtype --- ------ ----- 0 name object 1 sex object 2 number int64 3 year int64 dtypes: int64(2), object(2) memory usage: 59.7+ MB
# Sanity check: first and last year present in the combined table.
allyears['year'].min(), allyears['year'].max()
(1880, 2018)
# Write the combined table to disk (without the row index), then load it
# back from that file and display it.
allyears.to_csv('allyears.csv.zip', index=False)

pd.set_option('display.max_rows', 6)

allyears = pd.read_csv('allyears.csv.zip')
allyears
| name | sex | number | year | |
|---|---|---|---|---|
| 0 | Mary | F | 7065 | 1880 |
| 1 | Anna | F | 2604 | 1880 |
| 2 | Emma | F | 2003 | 1880 |
| ... | ... | ... | ... | ... |
| 1957043 | Zyrie | M | 5 | 2018 |
| 1957044 | Zyron | M | 5 | 2018 |
| 1957045 | Zzyzx | M | 5 | 2018 |
1957046 rows × 4 columns
# Index by (sex, name, year) and sort, so that one name's history can be
# selected with a (sex, name) key.
allyears_indexed = (
    allyears
    .set_index(keys=['sex', 'name', 'year'])
    .sort_index()
)

# Yearly counts for the female name 'Mary'.
pp.plot(allyears_indexed.loc[('F', 'Mary')])
[<matplotlib.lines.Line2D at 0xb61fe30>]
# Mary's count divided by the total number of recorded births in each year.
# Only the numeric 'number' column is summed: aggregating the whole frame would
# also try to sum the 'name' and 'sex' string columns, which is slow and gives
# non-numeric columns that cannot be divided. Keeping the result as a
# one-column frame lets the division align on both 'year' and 'number'.
pp.plot(allyears_indexed.loc[('F','Mary')] / allyears.groupby('year')[['number']].sum())
[<matplotlib.lines.Line2D at 0xa843150>]
def plotname(sex, name):
    """Draw the yearly counts for one (sex, name) pair on the current axes.

    Rows are looked up in the module-level ``allyears_indexed`` frame, which
    is indexed by (sex, name, year). The curve is labelled with ``name`` and
    the x axis is pinned to 1880-2018.
    """
    history = allyears_indexed.loc[(sex, name)]
    years, counts = history.index, history.values

    pp.plot(years, counts, label=name)
    pp.axis(xmin=1880, xmax=2018)
def comparenames(sex, names):
    """Open a wide figure and overlay one ``plotname`` curve per entry in ``names``.

    All names are plotted for the same ``sex``; a legend is added at the end.
    """
    pp.figure(figsize=(12, 2.5))

    for candidate in names:
        plotname(sex, candidate)

    pp.legend()
# Popularity over time for a handful of male and female names.
comparenames('M', ['Michael', 'John', 'David', 'Martin'])
comparenames('F', ['Emily', 'Anna', 'Claire', 'Elizabeth'])

# Spelling variants of "Claire": compare them, then show the underlying rows.
claires = ['Claire', 'Clare', 'Clara', 'Chiara', 'Ciara']
comparenames('F', claires)

allyears_indexed.loc[('F', claires), :]
| number | |||
|---|---|---|---|
| sex | name | year | |
| F | Claire | 1880 | 21 |
| 1881 | 23 | ||
| 1882 | 30 | ||
| ... | ... | ... | |
| Ciara | 2016 | 321 | |
| 2017 | 243 | ||
| 2018 | 256 |
522 rows × 1 columns
# First attempt at a stacked plot of the "Claire" variants: one row per name,
# one column per year after unstack(). NOTE(review): presumably years in which
# a variant is absent are left as NaN here, which is what the fillna(0) in the
# following version addresses.
pp.figure(figsize=(12,2.5))
pp.stackplot(range(1880,2019), allyears_indexed.loc[('F',claires),:].unstack(level=2));
# Stacked area chart of the "Claire" spelling variants.
# One row per name, one column per year; missing (name, year) pairs count as 0.
claire_counts = (allyears_indexed.loc[('F', claires), 'number']
                 .unstack(level='year')
                 .fillna(0))

pp.figure(figsize=(12,2.5))
# unstack() may order the rows differently from the `claires` list, so the
# legend labels (and the x values) are taken from the frame itself; this keeps
# every band paired with the name it actually represents.
pp.stackplot(claire_counts.columns, claire_counts,
             labels=claire_counts.index.get_level_values('name'));
pp.legend(loc='upper left')
pp.axis(xmin=1880, xmax=2018);
pd.set_option('display.max_rows', 10)

# Reload the combined table and index it by (sex, year) so that one year's
# names for one sex can be selected directly.
allyears = pd.read_csv('allyears.csv.zip')
allyears_byyear = (
    allyears
    .set_index(keys=['sex', 'year'])
    .sort_index()
)

# Male names of 2018, most frequent first.
allyears_byyear.loc['M', 2018].sort_values(by='number', ascending=False)
| name | number | ||
|---|---|---|---|
| sex | year | ||
| M | 2018 | Liam | 19837 |
| 2018 | Noah | 18267 | |
| 2018 | William | 14516 | |
| 2018 | James | 13525 | |
| 2018 | Oliver | 13389 | |
| ... | ... | ... | |
| 2018 | Gaylon | 5 | |
| 2018 | Gavynn | 5 | |
| 2018 | Gavon | 5 | |
| 2018 | Gaurav | 5 | |
| 2018 | Zzyzx | 5 |
14004 rows × 2 columns
# Ten most frequent male names of 2018.
(
    allyears_byyear.loc['M', 2018]
    .sort_values(by='number', ascending=False)
    .head(10)
)
| name | number | ||
|---|---|---|---|
| sex | year | ||
| M | 2018 | Liam | 19837 |
| 2018 | Noah | 18267 | |
| 2018 | William | 14516 | |
| 2018 | James | 13525 | |
| 2018 | Oliver | 13389 | |
| 2018 | Benjamin | 13381 | |
| 2018 | Elijah | 12886 | |
| 2018 | Lucas | 12585 | |
| 2018 | Mason | 12435 | |
| 2018 | Logan | 12352 |
# Female names of 2018, most frequent first.
(
    allyears_byyear.loc['F', 2018]
    .sort_values(by='number', ascending=False)
)
| name | number | ||
|---|---|---|---|
| sex | year | ||
| F | 2018 | Emma | 18688 |
| 2018 | Olivia | 17921 | |
| 2018 | Ava | 14924 | |
| 2018 | Isabella | 14464 | |
| 2018 | Sophia | 13928 | |
| ... | ... | ... | |
| 2018 | Ghala | 5 | |
| 2018 | Ghalia | 5 | |
| 2018 | Ghislaine | 5 | |
| 2018 | Giahna | 5 | |
| 2018 | Zyona | 5 |
18029 rows × 2 columns
# Just the names of the ten most frequent female names of 2018; reset_index()
# replaces the (sex, year) index with positions 0..9.
(
    allyears_byyear.loc['F', 2018]
    .sort_values(by='number', ascending=False)
    .head(10)
    .reset_index()['name']
)
0 Emma 1 Olivia 2 Ava 3 Isabella 4 Sophia 5 Charlotte 6 Mia 7 Amelia 8 Harper 9 Evelyn Name: name, dtype: object
def getyear(sex, year, top=10):
    """Return the most frequent names for one sex in one year.

    Rows are selected from the module-level ``allyears_byyear`` frame (indexed
    by sex and year), sorted by ``number`` in descending order and truncated.

    Parameters
    ----------
    sex : value of the ``sex`` index level to select (``'F'``/``'M'`` in this notebook).
    year : value of the ``year`` index level to select.
    top : how many names to keep; defaults to 10, the original fixed size.

    Returns
    -------
    A Series of names indexed 0..top-1, most frequent first.
    """
    return (allyears_byyear.loc[sex, year]
            .sort_values('number', ascending=False)
            .head(top)
            # swap the (sex, year) index for plain positions, so that results
            # for different years line up row by row
            .reset_index()['name'])
# One column per year (2010-2018), each holding that year's top male names.
pd.DataFrame({
    year: getyear('M', year)
    for year in range(2010, 2019)
})
| 2010 | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | 2018 | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Jacob | Jacob | Jacob | Noah | Noah | Noah | Noah | Liam | Liam |
| 1 | Ethan | Mason | Mason | Jacob | Liam | Liam | Liam | Noah | Noah |
| 2 | Michael | William | Ethan | Liam | Mason | Mason | William | William | William |
| 3 | Jayden | Jayden | Noah | Mason | Jacob | Jacob | Mason | James | James |
| 4 | William | Noah | William | William | William | William | James | Logan | Oliver |
| 5 | Alexander | Michael | Liam | Ethan | Ethan | Ethan | Benjamin | Benjamin | Benjamin |
| 6 | Noah | Ethan | Michael | Michael | Michael | James | Jacob | Mason | Elijah |
| 7 | Daniel | Alexander | Jayden | Alexander | Alexander | Alexander | Michael | Elijah | Lucas |
| 8 | Aiden | Aiden | Alexander | Jayden | James | Michael | Elijah | Oliver | Mason |
| 9 | Anthony | Daniel | Aiden | Daniel | Daniel | Benjamin | Ethan | Jacob | Logan |
# One column per year (2010-2018), each holding that year's top female names.
pd.DataFrame({
    year: getyear('F', year)
    for year in range(2010, 2019)
})
| 2010 | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | 2018 | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Isabella | Sophia | Sophia | Sophia | Emma | Emma | Emma | Emma | Emma |
| 1 | Sophia | Isabella | Emma | Emma | Olivia | Olivia | Olivia | Olivia | Olivia |
| 2 | Emma | Emma | Isabella | Olivia | Sophia | Sophia | Ava | Ava | Ava |
| 3 | Olivia | Olivia | Olivia | Isabella | Isabella | Ava | Sophia | Isabella | Isabella |
| 4 | Ava | Ava | Ava | Ava | Ava | Isabella | Isabella | Sophia | Sophia |
| 5 | Emily | Emily | Emily | Mia | Mia | Mia | Mia | Mia | Charlotte |
| 6 | Abigail | Abigail | Abigail | Emily | Emily | Abigail | Charlotte | Charlotte | Mia |
| 7 | Madison | Madison | Mia | Abigail | Abigail | Emily | Abigail | Amelia | Amelia |
| 8 | Chloe | Mia | Madison | Madison | Madison | Charlotte | Emily | Evelyn | Harper |
| 9 | Mia | Chloe | Elizabeth | Elizabeth | Charlotte | Harper | Harper | Abigail | Evelyn |
def plotname(sex, name):
    """Plot number versus year for one (sex, name) pair on the current axes.

    This version filters the flat module-level ``allyears`` table with a
    query instead of using an indexed lookup. The curve is labelled with
    ``name`` and the x axis is pinned to 1880-2018.
    """
    matches = allyears.query('sex ==@sex and name == @name')

    pp.plot(matches['year'], matches['number'], label=name)
    pp.axis(xmin=1880, xmax=2018)
# Histories of the 2018 top-ten female names, all on one figure.
pp.figure(figsize=(12, 2.5))

for name in getyear('F', 2018):
    plotname('F', name)

pp.legend()
# Histories of the 2018 top-ten male names, all on one figure.
pp.figure(figsize=(12, 2.5))

for name in getyear('M', 2018):
    plotname('M', name)

pp.legend()
# The 2018 top-ten male names as a plain Python list.
getyear('M', 2018).tolist()
['Liam', 'Noah', 'William', 'James', 'Oliver', 'Benjamin', 'Elijah', 'Lucas', 'Mason', 'Logan']
# Same plot restricted to a hand-picked subset of the 2018 top ten
# ('William' and 'James' are left out of this list).
pp.figure(figsize=(12, 2.5))

for name in ['Liam', 'Noah', 'Oliver', 'Benjamin', 'Elijah', 'Lucas', 'Mason', 'Logan']:
    plotname('M', name)

pp.legend()
# Ten female names with the largest total count summed over all years.
alltime_f = (
    allyears_byyear.loc['F']
    .groupby('name')
    .sum()
    .sort_values(by='number', ascending=False)
    .head(10)
)
alltime_f
| number | |
|---|---|
| name | |
| Mary | 4125675 |
| Elizabeth | 1638349 |
| Patricia | 1572016 |
| Jennifer | 1467207 |
| Linda | 1452668 |
| Barbara | 1434397 |
| Margaret | 1248985 |
| Susan | 1121703 |
| Dorothy | 1107635 |
| Sarah | 1077746 |
# Histories of the ten all-time most frequent female names.
pp.figure(figsize=(12, 2.5))

for name in alltime_f.index:
    plotname('F', name)

pp.legend()
pd.set_option('display.max_rows', 10)

allyears = pd.read_csv('allyears.csv.zip')

# Total count over all years for every (sex, name) pair.
totals = allyears.groupby(['sex', 'name'])['number'].sum()
totals
sex name
F Aabha 35
Aabidah 5
Aabriella 38
Aada 13
Aadaya 8
..
M Zyus 11
Zyvion 5
Zyvon 7
Zyyon 6
Zzyzx 10
Name: number, Length: 109173, dtype: int64
# Split the totals into one name-indexed Series per sex.
male = totals.loc['M']
female = totals.loc['F']
male
name
Aaban 114
Aabid 16
Aabir 10
Aadam 273
Aadan 130
...
Zyus 11
Zyvion 5
Zyvon 7
Zyyon 6
Zzyzx 10
Name: number, Length: 41475, dtype: int64
# Display the female totals (name-indexed Series taken from totals.loc['F']).
female
name
Aabha 35
Aabidah 5
Aabriella 38
Aada 13
Aadaya 8
..
Zyrielle 27
Zyrihanna 45
Zyriyah 16
Zyva 23
Zyyanna 6
Name: number, Length: 67698, dtype: int64
# Male-to-female ratio per name, tested against 2. The division aligns on
# name, so the result covers every name found under either sex.
(totals.loc['M'] / totals.loc['F']).lt(2)
name
Aaban False
Aabha False
Aabid False
Aabidah False
Aabir False
...
Zyvion False
Zyvon False
Zyyanna False
Zyyon False
Zzyzx False
Name: number, Length: 98400, dtype: bool
# Male-to-female ratio, keeping only names for which the division produced
# a value (dropna removes the rest).
ratios = totals.loc['M'].div(totals.loc['F']).dropna()

# Names whose ratio lies strictly between 1:2 and 2:1.
ratios[(ratios < 2) & (ratios > 0.5)]
name
Aalijah 1.422819
Aamari 1.430233
Aari 1.111111
Aarin 1.950331
Aaris 1.000000
...
Ziyan 1.251748
Zoel 1.928571
Zohar 0.520270
Zyian 0.714286
Zyrie 0.842105
Name: number, Length: 1660, dtype: float64
# Index of the "unisex" names: ratio strictly between 0.5 and 2.
unisex = ratios[(ratios < 2) & (ratios > 0.5)].index
unisex
Index(['Aalijah', 'Aamari', 'Aari', 'Aarin', 'Aaris', 'Aaryn', 'Aavyn', 'Abey',
'Abrar', 'Abriel',
...
'Zell', 'Zi', 'Ziel', 'Zihan', 'Zixuan', 'Ziyan', 'Zoel', 'Zohar',
'Zyian', 'Zyrie'],
dtype='object', name='name', length=1660)
# Ten unisex names with the largest combined (male + female) totals.
common = (
    male.loc[unisex]
    .add(female.loc[unisex])
    .sort_values(ascending=False)
    .head(10)
)
common
name Jessie 277674 Riley 201179 Casey 186947 Jackie 169199 Peyton 120657 Jaime 117885 Kerry 98195 Kendall 93317 Jody 86971 Frankie 74325 Name: number, dtype: int64
allyears_indexed = (
    allyears
    .set_index(keys=['sex', 'name', 'year'])
    .sort_index()
)

# 5x2 grid: one panel per common unisex name, male and female curves overlaid.
pp.figure(figsize=(9, 9))

for panel, name in enumerate(common.index, start=1):
    pp.subplot(5, 2, panel)

    pp.plot(allyears_indexed.loc['M', name], label='M')
    pp.plot(allyears_indexed.loc['F', name], label='F')

    pp.legend()
    pp.title(name)

pp.tight_layout()