Messing with data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
In [2]:
from matplotlib import pyplot as plt
from scipy import stats
In [3]:
data = pd.read_csv('clean_murder_data.csv')
stateAbbr = pd.read_csv("StatesAbbr.csv")

data.head()
C:\Users\Zachk\Anaconda3\envs\PythonData\lib\site-packages\IPython\core\interactiveshell.py:2728: DtypeWarning: Columns (16,20) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
Out[3]:
Unnamed: 0 CaseID CNTYFIPS Ori State Agency Agentype Source Solved Year ... Weapon Relationship Circumstance Subcircum VicCount OffCount FileDate fstate MSA StateAbbr
0 0 197601001AKASP00 Juneau, AK AL00102 Alabama Birmingham Municipal police FBI Yes 1976 ... Shotgun Acquaintance Felon killed by private citizen Felon killed in commission of a crime 0 0 30180 Alabama Birmingham-Hoover, AL AL
1 1 197601001AL00102 Jefferson, AL AL00104 Alabama Fairfield Municipal police FBI Yes 1976 ... Shotgun Wife Other NaN 0 0 30180 Alabama Birmingham-Hoover, AL AL
2 2 197601001AL00104 Jefferson, AL AL00106 Alabama Leeds Municipal police FBI Yes 1976 ... Knife or cutting instrument Brother Other arguments NaN 0 0 30180 Alabama Birmingham-Hoover, AL AL
3 3 197601001AL00106 Jefferson, AL AL00201 Alabama Mobile Municipal police FBI Yes 1976 ... Strangulation - hanging Acquaintance Circumstances undetermined NaN 0 0 30180 Alabama Mobile, AL AL
4 4 197601001AL00201 Mobile, AL AL00202 Alabama Prichard Municipal police FBI Yes 1976 ... Rifle Friend Other arguments NaN 0 2 30180 Alabama Mobile, AL AL

5 rows × 34 columns

In [4]:
print(data.columns)
print(stateAbbr.columns)
relationship_df = data.groupby(['Relationship']).CaseID.count().reset_index()
relationship_df
Index(['Unnamed: 0', 'CaseID', 'CNTYFIPS', 'Ori', 'State', 'Agency',
       'Agentype', 'Source', 'Solved', 'Year', 'StateName', 'Month',
       'Incident', 'ActionType', 'Homicide', 'Situation', 'VicAge', 'VicSex',
       'VicRace', 'VicEthnic', 'OffAge', 'OffSex', 'OffRace', 'OffEthnic',
       'Weapon', 'Relationship', 'Circumstance', 'Subcircum', 'VicCount',
       'OffCount', 'FileDate', 'fstate', 'MSA', 'StateAbbr'],
      dtype='object')
Index(['State', 'Abbreviation'], dtype='object')
Out[4]:
Relationship CaseID
0 Acquaintance 150341
1 Boyfriend 8687
2 Brother 6633
3 Common-law husband 2684
4 Common-law wife 3220
5 Daughter 8679
6 Employee 441
7 Employer 573
8 Ex-husband 865
9 Ex-wife 2330
10 Father 5380
11 Friend 26196
12 Girlfriend 18876
13 Homosexual relationship 1636
14 Husband 11797
15 In-law 4728
16 Mother 5044
17 Neighbor 7888
18 Other - known to victim 33464
19 Other family 11186
20 Relationship not determined 283106
21 Sister 1564
22 Son 11661
23 Stepdaughter 860
24 Stepfather 1686
25 Stepmother 261
26 Stepson 1438
27 Stranger 112668
28 Wife 27893

Creating State database

In [5]:
# Total Murders by state since 1976
totalMurders = data.groupby('State')['CaseID'].count()
print(totalMurders.head())
totalMurders_df = totalMurders.reset_index().rename(columns={'CaseID':'TotalMurders'})
print(totalMurders_df.head())
State
Alabama        15687
Alaska          1952
Arizona        14482
Arkansas        8081
California    114621
Name: CaseID, dtype: int64
        State  TotalMurders
0     Alabama         15687
1      Alaska          1952
2     Arizona         14482
3    Arkansas          8081
4  California        114621
In [6]:
# Number of UNSOLVED homicides by state since 1976
unsolved = data.loc[data['Solved'] == 'No']
unsolved = unsolved.groupby('State')['CaseID'].count()
print(unsolved.head())
unsolved_df = unsolved.reset_index().rename(columns={'CaseID':'TotalUnsolved'})
print(unsolved_df.head())
State
Alabama        3350
Alaska          362
Arizona        4065
Arkansas       1217
California    41459
Name: CaseID, dtype: int64
        State  TotalUnsolved
0     Alabama           3350
1      Alaska            362
2     Arizona           4065
3    Arkansas           1217
4  California          41459
In [7]:
# Percent of UNSOLVED homicides by state since 1976
percentUnsolved = (unsolved/totalMurders)*100
percentUnsolved = percentUnsolved.round(1)
print(percentUnsolved.round(1).head())
percentUnsolved_df = pd.DataFrame(percentUnsolved).reset_index().rename(columns={'CaseID':'PercentUnsolved'})
print(percentUnsolved_df.head())
State
Alabama       21.4
Alaska        18.5
Arizona       28.1
Arkansas      15.1
California    36.2
Name: CaseID, dtype: float64
        State  PercentUnsolved
0     Alabama             21.4
1      Alaska             18.5
2     Arizona             28.1
3    Arkansas             15.1
4  California             36.2
In [8]:
# Number of SOLVED homicides by state since 1976
solved = data.loc[data['Solved'] == 'Yes']
solved = solved.groupby('State')['CaseID'].count()
print(solved.head())
solved_df = solved.reset_index().rename(columns={'CaseID':'TotalSolved'})
print(solved_df.head())
State
Alabama       12337
Alaska         1590
Arizona       10417
Arkansas       6864
California    73162
Name: CaseID, dtype: int64
        State  TotalSolved
0     Alabama        12337
1      Alaska         1590
2     Arizona        10417
3    Arkansas         6864
4  California        73162
In [9]:
# Percent of SOLVED homicides by state since 1976
percentSolved = (solved/totalMurders)*100
percentSolved = percentSolved.round(1)
print(percentSolved.head())
percentSolved_df = pd.DataFrame(percentSolved).reset_index().rename(columns={'CaseID':'PercentSolved'})
print(percentSolved_df.head())
State
Alabama       78.6
Alaska        81.5
Arizona       71.9
Arkansas      84.9
California    63.8
Name: CaseID, dtype: float64
        State  PercentSolved
0     Alabama           78.6
1      Alaska           81.5
2     Arizona           71.9
3    Arkansas           84.9
4  California           63.8
In [10]:
# stateWeapon= data.groupby(['State','Weapon'])["CaseID"].count().reset_index()
# print(stateWeapon.head())
# stateWeapon = stateWeapon.sort_values(["CaseID"], ascending=False)
# stateWeapon_df = stateWeapon.drop_duplicates(["State"])
# print(stateWeapon_df.head())
In [11]:
# Merge all State data to create final csv
state_db = stateAbbr.merge(totalMurders_df,on="State")
state_db = state_db.merge(unsolved_df, on="State")
state_db = state_db.merge(percentUnsolved_df, on="State")
state_db = state_db.merge(solved_df, on="State")
state_db = state_db.merge(percentSolved_df, on="State")
print(state_db.head())
state_db.to_csv("state_db.csv", index=False)
        State Abbreviation  TotalMurders  TotalUnsolved  PercentUnsolved  \
0     Alabama           AL         15687           3350             21.4   
1      Alaska           AK          1952            362             18.5   
2     Arizona           AZ         14482           4065             28.1   
3    Arkansas           AR          8081           1217             15.1   
4  California           CA        114621          41459             36.2   

   TotalSolved  PercentSolved  
0        12337           78.6  
1         1590           81.5  
2        10417           71.9  
3         6864           84.9  
4        73162           63.8  

Testing for Murder data by month

In [12]:
def month_string_to_number(string):
    m = {
        'jan': 1,
        'feb': 2,
        'mar': 3,
        'apr':4,
         'may':5,
         'jun':6,
         'jul':7,
         'aug':8,
         'sep':9,
         'oct':10,
         'nov':11,
         'dec':12
        }
    s = string.strip()[:3].lower()

    try:
        out = m[s]
        return out
    except:
        raise ValueError('Not a month')
In [13]:
stateMurdersByMonth = data.groupby(['State','Month'])['CaseID'].count().reset_index()
#print(stateMurdersByMonth.head())
stateMurdersByMonth['MonthNums']=stateMurdersByMonth['Month'].map(month_string_to_number)
stateMurdersByMonth = stateMurdersByMonth.rename(columns={'CaseID':'TotalMurders'})
stateMurdersByMonth = stateMurdersByMonth.sort_values(['State',"MonthNums"], ascending=True)

plotala=stateMurdersByMonth.loc[stateMurdersByMonth['State']=='North Carolina']
print(plotala)
plt.errorbar(plotala.MonthNums, plotala['TotalMurders'], 
                 linestyle='--', marker='.', markersize=8)
              State      Month  TotalMurders  MonthNums
400  North Carolina    January          2029          1
399  North Carolina   February          1757          2
403  North Carolina      March          1921          3
396  North Carolina      April          1962          4
404  North Carolina        May          2002          5
402  North Carolina       June          2019          6
401  North Carolina       July          2183          7
397  North Carolina     August          2164          8
407  North Carolina  September          1967          9
406  North Carolina    October          1994         10
405  North Carolina   November          1936         11
398  North Carolina   December          2024         12
Out[13]:
<Container object of 3 artists>
In [14]:
pivot = stateMurdersByMonth.pivot(index='MonthNums', columns='State',values='TotalMurders')
print(pivot.head())
pivot = pivot.reset_index()
for state in stateAbbr['State']:
    plt.errorbar(pivot.MonthNums, pivot[state], fmt='2', 
                 linestyle='--', marker='.', markersize=8,capsize=4)
plt.show()
State      Alabama  Alaska  Arizona  Arkansas  California  Colorado  \
MonthNums                                                             
1             1314     154     1188       704        9513       629   
2             1120     148     1068       572        8233       545   
3             1205     193     1240       646        9005       589   
4             1243     135     1191       642        9105       622   
5             1245     173     1195       684        9623       595   

State      Connecticut  Delaware  District of Columbia  Florida   ...     \
MonthNums                                                         ...      
1                  444       121                   711     3722   ...      
2                  358       123                   604     3346   ...      
3                  383       122                   641     3665   ...      
4                  406       112                   571     3483   ...      
5                  443       117                   707     3732   ...      

State      South Dakota  Tennessee  Texas  Utah  Vermont  Virginia  \
MonthNums                                                            
1                    45       1362   5757   180       20      1581   
2                    45       1245   5291   176       33      1309   
3                    32       1437   5847   164       38      1463   
4                    51       1410   5883   195       37      1539   
5                    47       1467   6226   206       35      1543   

State      Washington  West Virginia  Wisconsin  Wyoming  
MonthNums                                                 
1                 717            290        561       55  
2                 682            289        511       44  
3                 792            293        525       61  
4                 701            281        590       61  
5                 669            314        629       60  

[5 rows x 51 columns]
In [15]:
relationship_df = data.groupby(['Relationship']).CaseID.count().reset_index()
relationship_df
Out[15]:
Relationship CaseID
0 Acquaintance 150341
1 Boyfriend 8687
2 Brother 6633
3 Common-law husband 2684
4 Common-law wife 3220
5 Daughter 8679
6 Employee 441
7 Employer 573
8 Ex-husband 865
9 Ex-wife 2330
10 Father 5380
11 Friend 26196
12 Girlfriend 18876
13 Homosexual relationship 1636
14 Husband 11797
15 In-law 4728
16 Mother 5044
17 Neighbor 7888
18 Other - known to victim 33464
19 Other family 11186
20 Relationship not determined 283106
21 Sister 1564
22 Son 11661
23 Stepdaughter 860
24 Stepfather 1686
25 Stepmother 261
26 Stepson 1438
27 Stranger 112668
28 Wife 27893
In [16]:
weapon_df = data.groupby(['Weapon']).CaseID.count().reset_index()
weapon_df
Out[16]:
Weapon CaseID
0 Asphyxiation - includes death by gas 4597
1 Blunt object - hammer, club, etc 34176
2 Drowning 1461
3 Explosives 602
4 Fire 7453
5 Firearm, type not stated 57238
6 Handgun - pistol, revolver, etc 370146
7 Knife or cutting instrument 112692
8 Narcotics or drugs, sleeping pills 1979
9 Other gun 2680
10 Other or type unknown 37552
11 Personal weapons, includes beating 43819
12 Poison - does not include gas 521
13 Pushed or thrown out window 247
14 Rifle 28626
15 Shotgun 38196
16 Strangulation - hanging 9800
In [17]:
circumstance_df = data.groupby(['Circumstance']).CaseID.count().reset_index()
circumstance_df
Out[17]:
Circumstance CaseID
0 Abortion 10
1 All other manslaughter by negligence 4560
2 All suspected felony type 13108
3 Argument over money or property 15152
4 Arson 4547
5 Brawl due to influence of alcohol 16004
6 Brawl due to influence of narcotics 5014
7 Burglary 6306
8 Child killed by babysitter 1241
9 Children playing with gun 1372
10 Circumstances undetermined 194564
11 Felon killed by police 16037
12 Felon killed by private citizen 12251
13 Gambling 1065
14 Gangland killings 4451
15 Gun-cleaning death - other than self 144
16 Institutional killings 1064
17 Juvenile gang killings 23652
18 Larceny 738
19 Lovers triangle 11895
20 Motor vehicle theft 1253
21 Narcotic drug laws 28205
22 Other 97130
23 Other - not specified 24112
24 Other arguments 198205
25 Other negligent handling of gun 3550
26 Other sex offense 1441
27 Prostitution and commercialized vice 598
28 Rape 4173
29 Robbery 58986
30 Sniper attack 548
31 Victim shot in hunting accident 409
In [18]:
year_df = data.groupby(['Year']).CaseID.count().reset_index()
year_df
Out[18]:
Year CaseID
0 1976 17619
1 1977 18844
2 1978 19523
3 1979 21698
4 1980 23092
5 1981 21208
6 1982 20544
7 1983 19653
8 1984 18093
9 1985 18386
10 1986 20125
11 1987 18783
12 1988 18779
13 1989 19868
14 1990 21246
15 1991 22657
16 1992 23793
17 1993 24336
18 1994 23246
19 1995 21179
20 1996 18996
21 1997 17830
22 1998 16089
23 1999 14644
24 2000 14692
25 2001 15821
26 2002 16287
27 2003 16532
28 2004 16246
29 2005 16852
30 2006 17294
31 2007 17327
32 2008 15610
33 2009 15857
34 2010 15133
35 2011 15119
36 2012 15390
37 2013 14841
38 2014 14702
39 2015 16064
40 2016 17787
In [19]:
NC_df = data.loc[data["StateAbbr"]=="NC"]
agency_df = NC_df.groupby(['Agency']).CaseID.count().reset_index()
agency_df=agency_df.rename(columns={'CaseID':'Murders'})
agency_sort=agency_df.sort_values(['Murders'], ascending=False)
agency=agency_sort.head(10)
agency
Out[19]:
Agency Murders
65 Charlotte-Mecklenburg 2869
145 Greensboro 959
103 Durham 950
397 Winston-Salem 823
289 Raleigh 705
121 Fayetteville 700
89 Cumberland County 588
304 Robeson County 527
161 High Point 363
390 Wilmington 351
In [32]:
agency.reset_index(drop=True,inplace=True)
labels=agency['Agency']
colors=['red','tan','darkred','grey']
agency.plot.pie('Murders',labels=labels,legend=False,autopct="%0.0f%%",colors=colors,fontsize=12)
plt.axis('equal')
plt.title("North Carolinas Most Murderous Cities",fontsize=15)
plt.ylabel('')
fig = plt.gcf()
fig.set_size_inches(10,10)
plt.tight_layout()
plt.savefig('ncmurder.png', bbox_inches='tight', transparent=True)
plt.show()
In [21]:
MSA_df = NC_df.groupby(['MSA']).CaseID.count().reset_index()
MSA_df.sort_values(['CaseID'], ascending=False)
Out[21]:
MSA CaseID
12 Rural North Carolina 7543
2 Charlotte-Gastonia-Concord, NC-SC 4355
6 Greensboro-High Point, NC 1970
10 Raleigh-Cary, NC 1581
3 Durham, NC 1493
4 Fayetteville, NC 1469
15 Winston-Salem, NC 1127
8 Hickory-Lenoir-Morganton, NC 875
0 Asheville, NC 691
11 Rocky Mount, NC 652
14 Wilmington, NC 636
5 Goldsboro, NC 479
7 Greenville, NC 439
1 Burlington, NC 334
9 Jacksonville, NC 286
13 Virginia Beach-Norfolk-Newport News, VA-NC 28
In [22]:
rel_state = data.groupby(['State','Relationship'])["CaseID"].count().reset_index()
relationship2=rel_state.sort_values(["CaseID","Relationship"], ascending=False)
relbystate=relationship2.drop_duplicates(["State"])
relbystate.head(5)
Out[22]:
State Relationship CaseID
136 California Relationship not determined 39349
931 New York Relationship not determined 34271
275 Florida Relationship not determined 28105
1237 Texas Relationship not determined 22396
648 Michigan Relationship not determined 14674
In [23]:
off_state = data.groupby(['State','OffRace'])["CaseID"].count().reset_index()
off2=off_state.sort_values(["CaseID","OffRace"], ascending=False)
offracebystate=off2.drop_duplicates(["State"])
offracebystate.head(51)
Out[23]:
State OffRace CaseID
24 California White 46773
218 Texas White 34012
163 New York Unknown 27423
49 Florida White 15951
112 Michigan Black 15741
67 Illinois Black 14029
191 Pennsylvania Black 12644
52 Georgia Black 12624
92 Louisiana Black 12252
167 North Carolina Black 11259
176 Ohio Black 9887
211 Tennessee Black 8695
230 Virginia Black 8644
102 Maryland Black 8439
14 Arizona White 8408
2 Alabama Black 8164
201 South Carolina Black 7999
128 Missouri Unknown 6452
152 New Jersey Black 6086
183 Oklahoma White 5428
43 District of Columbia Unknown 5202
122 Mississippi Black 5045
237 Washington White 4979
89 Kentucky White 4685
29 Colorado White 4632
72 Indiana Black 4632
17 Arkansas Black 3850
188 Oregon White 3270
159 New Mexico White 3027
245 Wisconsin Black 3003
144 Nevada White 2967
242 West Virginia White 2759
109 Massachusetts White 2753
34 Connecticut White 2004
119 Minnesota White 1906
84 Kansas White 1850
223 Utah White 1658
79 Iowa White 1361
64 Idaho White 1204
139 Nebraska White 909
99 Maine White 858
9 Alaska White 825
37 Delaware Black 671
198 Rhode Island White 644
252 Wyoming White 632
149 New Hampshire White 606
56 Hawaii Asian or Pacific Islander 601
134 Montana White 597
227 Vermont White 381
208 South Dakota White 337
173 North Dakota White 265
In [24]:
vic_state = data.groupby(['State','VicRace'])["CaseID"].count().reset_index()
vic2=vic_state.sort_values(["CaseID","VicRace"], ascending=False)
vicracebystate=vic2.drop_duplicates(["State"])
vicracebystate.head(51)
Out[24]:
State VicRace CaseID
24 California White 74483
217 Texas White 47024
160 New York Black 29621
48 Florida White 23526
111 Michigan Black 23125
66 Illinois Black 21950
91 Louisiana Black 17064
190 Pennsylvania Black 16656
51 Georgia Black 16614
101 Maryland Black 14709
175 Ohio Black 13354
165 North Carolina Black 12992
14 Arizona White 11925
126 Missouri Black 10980
229 Virginia Black 10546
2 Alabama Black 10115
210 Tennessee Black 9678
150 New Jersey Black 9420
200 South Carolina Black 8166
41 District of Columbia Black 7509
71 Indiana Black 7104
182 Oklahoma White 6614
236 Washington White 6480
29 Colorado White 6080
88 Kentucky White 5998
121 Mississippi Black 5621
143 Nevada White 4392
17 Arkansas Black 4262
157 New Mexico White 4072
187 Oregon White 4012
108 Massachusetts White 4009
246 Wisconsin White 3609
241 West Virginia White 3132
34 Connecticut White 2940
118 Minnesota White 2514
83 Kansas White 2422
222 Utah White 1983
78 Iowa White 1644
63 Idaho White 1316
138 Nebraska White 1077
9 Alaska White 1027
98 Maine White 996
197 Rhode Island White 966
36 Delaware Black 804
55 Hawaii Asian or Pacific Islander 742
147 New Hampshire White 731
251 Wyoming White 710
133 Montana White 653
226 Vermont White 435
207 South Dakota White 381
172 North Dakota White 304
In [25]:
unsolved=data.loc[data["Solved"]=='No']
age_state = data.groupby(['State','VicAge'])["CaseID"].count().reset_index()
age2=age_state.sort_values(["CaseID","VicAge"], ascending=False)
vicagebystate=age2.drop_duplicates(["State"])
vicagebystate=vicagebystate.rename(columns={'CaseID':'Murders','VicAge': 'VictimAge'})
vicagebystate.head(51)
Out[25]:
State VictimAge Murders
392 California 20 4749
3414 New York 25 2549
4812 Texas 22 1939
1425 Illinois 19 1479
1052 Florida 25 1356
2297 Michigan 21 1236
1912 Louisiana 22 951
1150 Georgia 23 866
3602 North Carolina 22 843
2101 Maryland 20 824
3773 Ohio 20 806
0 Alabama 0 708
5088 Virginia 22 693
2589 Missouri 21 676
4075 Pennsylvania 21 667
4619 Tennessee 23 627
1530 Indiana 23 523
199 Arizona 22 521
4441 South Carolina 25 480
869 District of Columbia 20 402
3877 Oklahoma 25 339
3209 New Jersey 21 321
480 Colorado 1 316
2199 Massachusetts 20 314
5187 Washington 22 297
5367 Wisconsin 1 292
2494 Mississippi 23 281
1820 Kentucky 25 279
298 Arkansas 23 256
2383 Minnesota 1 207
2932 Nevada 19 203
598 Connecticut 22 198
3958 Oregon 1 163
3308 New Mexico 23 161
1709 Kansas 1 146
4898 Utah 1 144
5294 West Virginia 29 115
1615 Iowa 1 110
100 Alaska 1 72
2826 Nebraska 1 72
1323 Idaho 1 69
4270 Rhode Island 20 68
1250 Hawaii 24 60
788 Delaware 29 54
5463 Wyoming 1 53
4522 South Dakota 1 38
1996 Maine 1 37
3010 New Hampshire 1 34
2675 Montana 1 29
4989 Vermont 1 23
3686 North Dakota 1 22
In [26]:
weapon_counts = data["Weapon"].value_counts().reset_index()
weaponcounts2 = weapon_counts.rename(columns={'Weapon':'Murders'})
weapon=weaponcounts2.rename(columns={"index":"Weapon"})
weapon
Out[26]:
Weapon Murders
0 Handgun - pistol, revolver, etc 370146
1 Knife or cutting instrument 112692
2 Firearm, type not stated 57238
3 Personal weapons, includes beating 43819
4 Shotgun 38196
5 Other or type unknown 37552
6 Blunt object - hammer, club, etc 34176
7 Rifle 28626
8 Strangulation - hanging 9800
9 Fire 7453
10 Asphyxiation - includes death by gas 4597
11 Other gun 2680
12 Narcotics or drugs, sleeping pills 1979
13 Drowning 1461
14 Explosives 602
15 Poison - does not include gas 521
16 Pushed or thrown out window 247
In [27]:
weapons=weapon.set_index("Weapon")
weapons2=weapons.head(20)
weapons2.plot(kind='bar',y='Murders',figsize=(15,15),fontsize=15,legend=False,color="red",)
plt.xlabel("Weapon",fontsize=30)
plt.ylabel('Murders',fontsize=30)
plt.ylim(ymax=125000)
plt.title('Murder Weapons',fontsize=40)
plt.tight_layout()
plt.savefig('mweapons.png',transparent=True)
plt.show()