Messing with data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
In [2]:
from matplotlib import pyplot as plt
from scipy import stats
In [5]:
data = pd.read_csv('clean_murder_data.csv')
# stateAbbr = pd.read_csv("StatesAbbr.csv")

data.head()
/Users/randi/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2698: DtypeWarning: Columns (15,19) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
Out[5]:
ID CNTYFIPS Ori State Agency Agentype Source Solved Year StateName ... OffEthnic Weapon Relationship Circumstance Subcircum VicCount OffCount FileDate fstate MSA
0 197601001AKASP00 Juneau, AK AKASP00 Alaska State Troopers Primary state LE FBI Yes 1976 ALASKA ... Unknown or not reported Knife or cutting instrument Husband Other arguments NaN 0 0 30180 Alaska Rural Alaska
1 197601001AL00102 Jefferson, AL AL00102 Alabama Birmingham Municipal police FBI Yes 1976 ALA ... Unknown or not reported Shotgun Acquaintance Felon killed by private citizen Felon killed in commission of a crime 0 0 30180 Alabama Birmingham-Hoover, AL
2 197601001AL00104 Jefferson, AL AL00104 Alabama Fairfield Municipal police FBI Yes 1976 ALA ... Unknown or not reported Shotgun Wife Other NaN 0 0 30180 Alabama Birmingham-Hoover, AL
3 197601001AL00106 Jefferson, AL AL00106 Alabama Leeds Municipal police FBI Yes 1976 ALA ... Unknown or not reported Knife or cutting instrument Brother Other arguments NaN 0 0 30180 Alabama Birmingham-Hoover, AL
4 197601001AL00201 Mobile, AL AL00201 Alabama Mobile Municipal police FBI Yes 1976 ALA ... Unknown or not reported Strangulation - hanging Acquaintance Circumstances undetermined NaN 0 0 30180 Alabama Mobile, AL

5 rows × 32 columns

In [6]:
print(data.columns)
print(stateAbbr.columns)
relationship_df = data.groupby(['Relationship']).CaseID.count().reset_index()
relationship_df
Index(['ID', 'CNTYFIPS', 'Ori', 'State', 'Agency', 'Agentype', 'Source',
       'Solved', 'Year', 'StateName', 'Month', 'Incident', 'ActionType',
       'Homicide', 'Situation', 'VicAge', 'VicSex', 'VicRace', 'VicEthnic',
       'OffAge', 'OffSex', 'OffRace', 'OffEthnic', 'Weapon', 'Relationship',
       'Circumstance', 'Subcircum', 'VicCount', 'OffCount', 'FileDate',
       'fstate', 'MSA'],
      dtype='object')
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-6-8f83de2b1d42> in <module>()
      1 print(data.columns)
----> 2 print(stateAbbr.columns)
      3 relationship_df = data.groupby(['Relationship']).CaseID.count().reset_index()
      4 relationship_df

NameError: name 'stateAbbr' is not defined

Creating State database

In [ ]:
# Total Murders by state since 1976
totalMurders = data.groupby('State')['CaseID'].count()
print(totalMurders.head())
totalMurders_df = totalMurders.reset_index().rename(columns={'CaseID':'TotalMurders'})
print(totalMurders_df.head())
In [ ]:
# Number of UNSOLVED homicides by state since 1976
unsolved = data.loc[data['Solved'] == 'No']
unsolved = unsolved.groupby('State')['CaseID'].count()
print(unsolved.head())
unsolved_df = unsolved.reset_index().rename(columns={'CaseID':'TotalUnsolved'})
print(unsolved_df.head())
In [ ]:
# Percent of UNSOLVED homicides by state since 1976
percentUnsolved = (unsolved/totalMurders)*100
percentUnsolved = percentUnsolved.round(1)
print(percentUnsolved.round(1).head())
percentUnsolved_df = pd.DataFrame(percentUnsolved).reset_index().rename(columns={'CaseID':'PercentUnsolved'})
print(percentUnsolved_df.head())
In [ ]:
# Number of SOLVED homicides by state since 1976
solved = data.loc[data['Solved'] == 'Yes']
solved = solved.groupby('State')['CaseID'].count()
print(solved.head())
solved_df = solved.reset_index().rename(columns={'CaseID':'TotalSolved'})
print(solved_df.head())
In [ ]:
# Percent of SOLVED homicides by state since 1976
percentSolved = (solved/totalMurders)*100
percentSolved = percentSolved.round(1)
print(percentSolved.head())
percentSolved_df = pd.DataFrame(percentSolved).reset_index().rename(columns={'CaseID':'PercentSolved'})
print(percentSolved_df.head())
In [ ]:
# stateWeapon= data.groupby(['State','Weapon'])["CaseID"].count().reset_index()
# print(stateWeapon.head())
# stateWeapon = stateWeapon.sort_values(["CaseID"], ascending=False)
# stateWeapon_df = stateWeapon.drop_duplicates(["State"])
# print(stateWeapon_df.head())
In [ ]:
# Merge all State data to create final csv
state_db = stateAbbr.merge(totalMurders_df,on="State")
state_db = state_db.merge(unsolved_df, on="State")
state_db = state_db.merge(percentUnsolved_df, on="State")
state_db = state_db.merge(solved_df, on="State")
state_db = state_db.merge(percentSolved_df, on="State")
print(state_db.head())
state_db.to_csv("state_db.csv", index=False)

Testing for Murder data by month

In [ ]:
def month_string_to_number(string):
    m = {
        'jan': 1,
        'feb': 2,
        'mar': 3,
        'apr':4,
         'may':5,
         'jun':6,
         'jul':7,
         'aug':8,
         'sep':9,
         'oct':10,
         'nov':11,
         'dec':12
        }
    s = string.strip()[:3].lower()

    try:
        out = m[s]
        return out
    except:
        raise ValueError('Not a month')
In [ ]:
stateMurdersByMonth = data.groupby(['State','Month'])['CaseID'].count().reset_index()
#print(stateMurdersByMonth.head())
stateMurdersByMonth['MonthNums']=stateMurdersByMonth['Month'].map(month_string_to_number)
stateMurdersByMonth = stateMurdersByMonth.rename(columns={'CaseID':'TotalMurders'})
stateMurdersByMonth = stateMurdersByMonth.sort_values(['State',"MonthNums"], ascending=True)

plotala=stateMurdersByMonth.loc[stateMurdersByMonth['State']=='North Carolina']
print(plotala)
plt.errorbar(plotala.MonthNums, plotala['TotalMurders'], 
                 linestyle='--', marker='.', markersize=8)
In [ ]:
pivot = stateMurdersByMonth.pivot(index='MonthNums', columns='State',values='TotalMurders')
print(pivot.head())
pivot = pivot.reset_index()
for state in stateAbbr['State']:
    plt.errorbar(pivot.MonthNums, pivot[state], fmt='2', 
                 linestyle='--', marker='.', markersize=8,capsize=4)
plt.show()
In [ ]:
relationship_df = data.groupby(['Relationship']).CaseID.count().reset_index()
relationship_df
In [ ]:
weapon_df = data.groupby(['Weapon']).CaseID.count().reset_index()
weapon_df
In [ ]:
circumstance_df = data.groupby(['Circumstance']).CaseID.count().reset_index()
circumstance_df
In [ ]:
year_df = data.groupby(['Year']).CaseID.count().reset_index()
year_df
In [ ]:
NC_df = data.loc[data["StateAbbr"]=="NC"]
agency_df = NC_df.groupby(['Agency']).CaseID.count().reset_index()
agency_df.sort_values(['CaseID'], ascending=False)
In [ ]:
MSA_df = NC_df.groupby(['MSA']).CaseID.count().reset_index()
MSA_df.sort_values(['CaseID'], ascending=False)
In [ ]:
rel_state = data.groupby(['State','Relationship'])["CaseID"].count().reset_index()
relationship2=rel_state.sort_values(["CaseID","Relationship"], ascending=False)
relbystate=relationship2.drop_duplicates(["State"])
relbystate.head(5)
In [ ]:
off_state = data.groupby(['State','OffRace'])["CaseID"].count().reset_index()
off2=off_state.sort_values(["CaseID","OffRace"], ascending=False)
offracebystate=off2.drop_duplicates(["State"])
offracebystate.head(51)
In [ ]:
vic_state = data.groupby(['State','VicRace'])["CaseID"].count().reset_index()
vic2=vic_state.sort_values(["CaseID","VicRace"], ascending=False)
vicracebystate=vic2.drop_duplicates(["State"])
vicracebystate.head(51)
In [ ]:
unsolved=data.loc[data["Solved"]=='No']
age_state = data.groupby(['State','VicAge'])["CaseID"].count().reset_index()
age2=age_state.sort_values(["CaseID","VicAge"], ascending=False)
vicagebystate=age2.drop_duplicates(["State"])
vicagebystate=vicagebystate.rename(columns={'CaseID':'Murders','VicAge': 'VictimAge'})
vicagebystate.head(51)
In [ ]:
vicagebystate.plot(x='VictimAge',y='Murders',kind='Bar')
In [ ]:
weapon_counts = data["Weapon"].value_counts().reset_index()
weaponcounts2 = weapon_counts.rename(columns={'Weapon':'Murders'})
weapon=weaponcounts2.rename(columns={"index":"Weapon"})
weapon
In [ ]:
weapons=weapon.set_index("Weapon")
weapons2=weapons.head(20)
weapons2.plot(kind='bar',y='Murders',figsize=(30,10),fontsize=25,legend=False,color="red")
plt.xlabel("Weapon",fontsize=30)
plt.ylabel('Murders',fontsize=30)
plt.ylim(ymax=125000)
plt.title('Murder Weapons',fontsize=50)
plt.show()
savefig('weapon.png',transparent=True)
In [ ]:
#vic age by state graph bar??
#vic sex of unsolved murders
In [10]:
csv_path = "clean_murder_data.csv"
data_df = pd.read_csv (csv_path)
data_df.head()
/Users/randi/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2698: DtypeWarning: Columns (15,19) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
Out[10]:
ID CNTYFIPS Ori State Agency Agentype Source Solved Year StateName ... OffEthnic Weapon Relationship Circumstance Subcircum VicCount OffCount FileDate fstate MSA
0 197601001AKASP00 Juneau, AK AKASP00 Alaska State Troopers Primary state LE FBI Yes 1976 ALASKA ... Unknown or not reported Knife or cutting instrument Husband Other arguments NaN 0 0 30180 Alaska Rural Alaska
1 197601001AL00102 Jefferson, AL AL00102 Alabama Birmingham Municipal police FBI Yes 1976 ALA ... Unknown or not reported Shotgun Acquaintance Felon killed by private citizen Felon killed in commission of a crime 0 0 30180 Alabama Birmingham-Hoover, AL
2 197601001AL00104 Jefferson, AL AL00104 Alabama Fairfield Municipal police FBI Yes 1976 ALA ... Unknown or not reported Shotgun Wife Other NaN 0 0 30180 Alabama Birmingham-Hoover, AL
3 197601001AL00106 Jefferson, AL AL00106 Alabama Leeds Municipal police FBI Yes 1976 ALA ... Unknown or not reported Knife or cutting instrument Brother Other arguments NaN 0 0 30180 Alabama Birmingham-Hoover, AL
4 197601001AL00201 Mobile, AL AL00201 Alabama Mobile Municipal police FBI Yes 1976 ALA ... Unknown or not reported Strangulation - hanging Acquaintance Circumstances undetermined NaN 0 0 30180 Alabama Mobile, AL

5 rows × 32 columns

In [11]:
data2=data_df.loc[data_df['Solved']=='No']