For this “Hello World” example, you are working on a problem given to you by the manager of the Security Operations Center (SOC). It seems the SOC analysts are becoming inundated with “trivial” alerts ever since a new data set of indicators was introduced into the Security Information and Event Management (SIEM) system. They have asked for your help in reducing the number of “trivial” alerts without sacrificing visibility.
This is a good problem to tackle through data analysis: after some exploratory data analysis we should be able to form a solid, practical question to ask, and hopefully arrive at an answer that helps out the SOC.
This example is adapted from the textbook "Data-Driven Security: Analysis, Visualization and Dashboards" by Jay Jacobs and Bob Rudis.
Data: You will need to access the Lab 01 Dataset available on Blackboard to complete this task.
It is strongly recommended that you try the code samples in your own notebook instance to fully understand the examples.
import pandas as pd
import sys
# read in the data into a pandas data frame
avRep = "./example_data/reputation.data"
av = pd.read_csv(avRep, sep="#", header=None)
# make smarter column names
av.columns = ["IP","Reliability","Risk","Type","Country", "Locale","Coords","x"]
av
 | IP | Reliability | Risk | Type | Country | Locale | Coords | x |
---|---|---|---|---|---|---|---|---|
0 | 222.76.212.189 | 4 | 2 | Scanning Host | CN | Xiamen | 24.4797992706,118.08190155 | 11 |
1 | 222.76.212.185 | 4 | 2 | Scanning Host | CN | Xiamen | 24.4797992706,118.08190155 | 11 |
2 | 222.76.212.186 | 4 | 2 | Scanning Host | CN | Xiamen | 24.4797992706,118.08190155 | 11 |
3 | 5.34.246.67 | 6 | 3 | Spamming | US | NaN | 38.0,-97.0 | 12 |
4 | 178.94.97.176 | 4 | 5 | Scanning Host | UA | Merefa | 49.8230018616,36.0507011414 | 11 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
258621 | 179.244.194.219 | 4 | 2 | Spamming | BR | NaN | -10.0,-55.0 | 12 |
258622 | 216.99.159.166 | 4 | 2 | Scanning Host | US | Walnut | 34.0115013123,-117.853500366 | 11 |
258623 | 216.99.159.169 | 3 | 2 | Scanning Host | US | Walnut | 34.0115013123,-117.853500366 | 11 |
258624 | 216.99.159.176 | 3 | 2 | Scanning Host | US | Walnut | 34.0115013123,-117.853500366 | 11 |
258625 | 216.99.159.117 | 3 | 3 | Scanning Host | US | Walnut | 34.0115013123,-117.853500366 | 11 |
258626 rows × 8 columns
Before going any deeper, let's just look at the data so that we know what we are working with:
Reliability, Risk, and x are integers.
IP, Type, Country, Locale, and Coords are character strings.
The IP address is stored in the dotted-quad notation, not in hostnames or decimal format.
Each record is associated with a unique IP address, so there are 258,626 IP addresses (in this download).
Each IP address has been geo-located into the latitude and longitude pair in the Coords field, but they are in a single field separated by a comma. You will have to parse that further if you want to use that field.
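Before moving on, we can verify those types directly, and sketch one way of parsing the Coords field into two numeric columns (the Lat and Long names below are our own choice, not part of the data set):
# confirm the column types pandas inferred
print(av.dtypes)
# split the comma-separated Coords field into two float columns
coords = av['Coords'].str.split(',', expand=True).astype(float)
coords.columns = ['Lat', 'Long']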
What do we mean by descriptive statistics? As the name suggests, these essentially describe the properties of our data. They help with summarisation, and provide an easier means of comparison between two groups of data. You will likely be familiar with some of these concepts, but it is nevertheless important to think about how they can be used to summarise a data set, whether there are any potential issues with using them, and how we can overcome those issues.
Commonly used descriptive statistics include: the count of observations; the mean (average); the standard deviation; the minimum and maximum; and the quartiles (the 25%, 50% and 75% points, where the 50% point is the median).
Whilst we can calculate these "in code" ourselves, or use built-in functions such as np.mean(), Pandas provides a convenient describe() function that computes all of them at once.
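As a quick sanity check, here is a sketch of computing a few of these by hand; the results should agree with the describe() output below:
import numpy as np
# mean, median and standard deviation computed directly
print(np.mean(av['Reliability']))    # matches describe()'s mean
print(np.median(av['Reliability']))  # matches the 50% quartile
print(av['Reliability'].std())       # matches describe()'s std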
av['Reliability'].describe()
count    258626.000000
mean          2.798040
std           1.130419
min           1.000000
25%           2.000000
50%           2.000000
75%           4.000000
max          10.000000
Name: Reliability, dtype: float64
av['Risk'].describe()
count    258626.000000
mean          2.221362
std           0.531571
min           1.000000
25%           2.000000
50%           2.000000
75%           2.000000
max           7.000000
Name: Risk, dtype: float64
Above, note how we can select the specific column using av['Reliability'] or av['Risk'].
An important note to make (from the AlienVault documentation) is that Risk and Reliability are scored 1-10; however, these are ordinal values rather than numerical ones.
What does this mean? Essentially, ordinal values denote order, but they are not quantities. Therefore, a Risk score of 4 is not twice the risk of a score of 2; it is simply greater.
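If we wanted to make that ordinal nature explicit in Pandas (a sketch, and not required for the rest of this lab), we could store the scores as ordered categoricals, which permit comparisons but not arithmetic:
# treat Risk as ordered categories 1..10 rather than as numbers
risk_ordinal = pd.Categorical(av['Risk'], categories=range(1, 11), ordered=True)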
def factor_col(col):
    # treat the column as a categorical "factor", then count the
    # occurrences of each level, keeping the levels in their natural order
    factor = pd.Categorical(col)
    return pd.value_counts(factor, sort=False)
rel_ct = pd.value_counts(av['Reliability'])
risk_ct = pd.value_counts(av['Risk'])
type_ct = pd.value_counts(av['Type'])
country_ct = pd.value_counts(av['Country'])
print ("--- Reliability ---")
print (factor_col(av['Reliability']))
print ("\n ")
print ("--- Risk ---")
print (factor_col(av['Risk']))
print ("\n ")
print ("--- Type ---")
print (factor_col(av['Type']).head(n=10))
print ("\n ")
print ("--- Country ---")
print (factor_col(av['Country']).head(n=10))
print ("\n ")
--- Reliability ---
1       5612
2     149117
3      10892
4      87040
5          7
6       4758
7        297
8         21
9        686
10       196
dtype: int64

--- Risk ---
1        39
2    213852
3     33719
4      9588
5      1328
6        90
7        10
dtype: int64

--- Type ---
APT;Malware Domain                  1
C&C                               610
C&C;Malware Domain                 31
C&C;Malware IP                     20
C&C;Scanning Host                   7
Malicious Host                   3770
Malicious Host;Malware Domain       4
Malicious Host;Malware IP           2
Malicious Host;Scanning Host      163
Malware Domain                   9274
dtype: int64

--- Country ---
A1     267
A2       2
AE    1827
AL       4
AM       6
AN       3
AO     256
AR    3046
AT      51
AU     155
dtype: int64
Above, we define our own function called factor_col() that identifies all possible values within a given column and counts the number of occurrences of each. A similar result can be achieved in Pandas using groupby.
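For instance, a sketch of the equivalent counts using groupby:
# group on the column and count the rows in each group;
# this mirrors what factor_col() returns
av.groupby('Reliability').size()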
import matplotlib.pyplot as plt
import seaborn as sb
# count alerts per country (value_counts sorts by frequency, descending)
country_ct = pd.value_counts(av['Country'])
# plot the data
plt.axes(frameon=0) # reduce chart junk
country_ct[:20].plot(kind='bar', rot=0, title="Summary By Country", figsize=(20,10)).grid(False)
plt.show()
plt.axes(frameon=0) # reduce chart junk
factor_col(av['Reliability']).plot(kind='bar', rot=0, title="Summary By 'Reliability'", figsize=(20,10)).grid(False)
plt.show()
plt.axes(frameon=0) # reduce chart junk
factor_col(av['Risk']).plot(kind='bar', rot=0, title="Summary By 'Risk'", figsize=(20,10)).grid(False)
plt.show()
Above, we have created bar plots that show the values for each attribute, allowing us to examine these in greater detail.
top10 = pd.value_counts(av['Country'])[0:9]
# calculate the proportion of all records for each of these
top10.astype(float) / len(av['Country'])
CN    0.265182
US    0.194826
TR    0.053970
DE    0.038484
NL    0.030666
RU    0.024537
GB    0.024332
IN    0.021189
FR    0.021069
Name: Country, dtype: float64
Note that above, we have divided through by the length of the Country column, giving the proportion of all records attributed to each country rather than absolute counts. Also note that the slice [0:9] actually returns only the first nine entries; a true top ten would need [0:10] or .head(10).
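Incidentally, Pandas can compute these proportions directly; a sketch using the normalize flag of value_counts:
# proportions instead of raw counts, in one step
av['Country'].value_counts(normalize=True).head(10)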
Perhaps we want to look at both risk and reliability together? We can use a crosstab to achieve this.
from matplotlib import cm
from numpy import arange
print(pd.crosstab(av['Risk'], av['Reliability']).to_string())
Reliability     1       2     3      4  5     6    7   8    9  10
Risk
1               0       0    16      7  0     8    8   0    0   0
2             804  149114  3670  57653  4  2084   85  11  345  82
3            2225       3  6668  22168  2  2151  156   7  260  79
4            2129       0   481   6447  0   404   43   2   58  24
5             432       0    55    700  1   103    5   1   20  11
6              19       0     2     60  0     8    0   0    1   0
7               3       0     0      5  0     0    0   0    2   0
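If the raw counts are hard to compare across rows, crosstab can also normalise for us; a sketch:
# express each cell as a proportion of its row using crosstab's normalize option
pd.crosstab(av['Risk'], av['Reliability'], normalize='index')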
# graphical view of contingency table (swapping risk/reliability)
xtab = pd.crosstab(av['Reliability'], av['Risk'])
fig = plt.figure(figsize=(5,5))
plt.pcolormesh(xtab, cmap=cm.Greens)
plt.yticks(arange(0.5,len(xtab.index), 1),xtab.index)
plt.xticks(arange(0.5,len(xtab.columns), 1),xtab.columns)
plt.colorbar()
plt.title("Risk ~ Reliability")
plt.show()
OK, so this starts to highlight some interesting details about risk and reliability; however, it is lacking context. Can we identify risk/reliability against each type of alert as observed in the SOC? Let's try below.
# create new column as a copy of the Type column
av['newtype'] = av['Type']
# replace multi-Type entries with "Multiples"
# (note: this assignment overwrites every column of the matching rows,
# which is why "Multiples" also appears below as its own
# Reliability/Risk level in the crosstab)
av[av['newtype'].str.contains(";")] = "Multiples"
# setup new crosstab structures
typ = av['newtype']
rel = av['Reliability']
rsk = av['Risk']
# compute crosstab making it split on the
# new “type” column
xtab = pd.crosstab(typ, [ rel, rsk ], rownames=['typ'], colnames=['rel', 'rsk'])
print(xtab.to_string())
rel                      1                              2          3
rsk                        2     3     4    5   6  7         2  3    1     2     3    4   5  6
typ
C&C                        0     0     1    2   1  0         0  0    0     0     0  313  22  2
Malicious Host             0     6    51   41   8  1         0  0    1   206  2250    7   2  0
Malware Domain            12     1     0    0   0  0      7309  0    2   246    55    2   1  0
Malware IP                 0    23    11   15  10  2         0  3   12   415  4091   71   6  0
Malware distribution       0     0     0    0   0  0         0  0    0     0     1    0   0  0
Multiples                  0     0     0    0   0  0         0  0    0     0     0    0   0  0
Scanning Host            790  2189  2056  366   0  0    141543  0    1  2685   159   35  13  0
Spamming                   1     2     9    7   0  0         1  0    0    22     9   17   6  0

rel                      4                                    5          6
rsk                      1      2      3     4    5   6  7    2  3  5    1    2    3    4   5  6
typ
C&C                      0      0      0    15   22   4  1    0  0  1    0    0    0   98  60  5
Malicious Host           0    152    512   336  138  30  2    1  0  0    1    3    8    8   4  0
Malware Domain           0     60     18     2    0   0  0    2  1  0    2  921  273   26   2  0
Malware IP               1    132    205   122   45  13  2    0  1  0    3   10  793  133  11  3
Malware distribution     0      0      0     0    0   0  0    0  0  0    0    0    0    0   0  0
Multiples                0      0      0     0    0   0  0    0  0  0    0    0    0    0   0  0
Scanning Host            6  55654  21325  5931  488  13  0    1  0  0    2  611  107   23   1  0
Spamming                 0   1536     40    21    4   0  0    0  0  0    0  512  931  106  17  0

rel                      7                8             9                         10            Multiples
rsk                      1   2    3   4  5    2  3  4  5    2    3   4   5  6  7    2   3   4  5  Multiples
typ
C&C                      0   0    0   7  3    0  0  1  1    0    0  19  16  1  1    0   1   8  5          0
Malicious Host           0   0    0   0  0    0  0  0  0    0    2   0   0  0  0    0   0   0  0          0
Malware Domain           3  72   13   0  0    7  1  1  0  135   38   6   0  0  0   54   7   2  0          0
Malware IP               5   0  140  35  0    0  6  0  0    1   74  10   0  0  0    0  53  11  2          0
Malware distribution     0   0    0   0  0    0  0  0  0    0    0   0   0  0  0    0   0   0  0          0
Multiples                0   0    0   0  0    0  0  0  0    0    0   0   0  0  0    0   0   0  0        834
Scanning Host            0   0    0   0  0    2  0  0  0  150   22   7   0  0  0    0   0   0  0          0
Spamming                 0   4    1   0  2    1  0  0  0   52  120  15   3  0  0   24  17   3  4          0
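As an aside, the row-wide assignment above is why "Multiples" appears as its own Reliability/Risk level. If we only wanted to re-label the type while keeping each row's actual Reliability and Risk, the idiomatic approach would be a .loc assignment (a sketch; note this would produce a slightly different crosstab from the one shown):
# replace only the newtype cell of the matching rows, leaving the other columns untouched
av.loc[av['newtype'].str.contains(";"), 'newtype'] = "Multiples"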
This data is difficult to digest in tabular form: as discussed, there is simply too much of it, and the nested column structure makes it challenging to follow.
Instead, let's consider a bar chart.
xtab.plot(kind='bar', legend=False, title="Risk ~ Reliability | Type", figsize=(20,10)).grid(False)
plt.show()
Excellent! We have a bar chart that shows the combined risk/reliability measures against each type of SOC alert. This starts to look useful. However, perhaps we want to exclude Scanning Host: we expect this behaviour on our network, and its dominance makes it harder to observe other details in the data. Let's exclude it next.
# Here we remove Scanning Host
rrt_df = av[av['newtype'] != "Scanning Host"]
# And then we do the chart again
typ = rrt_df['newtype']
rel = rrt_df['Reliability']
rsk = rrt_df['Risk']
xtab = pd.crosstab(typ, [ rel, rsk ], rownames=['typ'], colnames=['rel', 'rsk'])
xtab.plot(kind='bar', legend=False, title="Risk ~ Reliability | Type", figsize=(20,10)).grid(False)
plt.show()
OK, this looks more interesting now. We see Malware Domain and Malware distribution cropping up, which makes sense; however, we may not necessarily be interested in these for this particular story. Let's exclude them and see what we are left with.
rrt_df = rrt_df[rrt_df['newtype'] != "Malware distribution" ]
rrt_df = rrt_df[rrt_df['newtype'] != "Malware Domain" ]
typ = rrt_df['newtype']
rel = rrt_df['Reliability']
rsk = rrt_df['Risk']
xtab = pd.crosstab(typ, [ rel, rsk ], rownames=['typ'], colnames=['rel', 'rsk'])
print ("Count: %d; Percent: %2.1f%%" % (len(rrt_df), (float(len(rrt_df)) / len(av)) * 100))
## Count: 15171; Percent: 5.9%
xtab.plot(kind='bar', legend=False, title="Risk ~ Reliability | Type", figsize=(20,10)).grid(False)
plt.show()
xtab
Count: 15171; Percent: 5.9%
rel | 1 | 1 | 1 | 1 | 1 | 1 | 2 | 2 | 3 | 3 | ... | 9 | 9 | 9 | 9 | 9 | 10 | 10 | 10 | 10 | Multiples
rsk | 2 | 3 | 4 | 5 | 6 | 7 | 2 | 3 | 1 | 2 | ... | 3 | 4 | 5 | 6 | 7 | 2 | 3 | 4 | 5 | Multiples
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
C&C | 0 | 0 | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 19 | 16 | 1 | 1 | 0 | 1 | 8 | 5 | 0
Malicious Host | 0 | 6 | 51 | 41 | 8 | 1 | 0 | 0 | 1 | 206 | ... | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
Malware IP | 0 | 23 | 11 | 15 | 10 | 2 | 0 | 3 | 12 | 415 | ... | 74 | 10 | 0 | 0 | 0 | 0 | 53 | 11 | 2 | 0
Multiples | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 834
Spamming | 1 | 2 | 9 | 7 | 0 | 0 | 1 | 0 | 0 | 22 | ... | 120 | 15 | 3 | 0 | 0 | 24 | 17 | 3 | 4 | 0
5 rows × 50 columns
We have now managed to filter our data down to reveal aspects of malware IPs and malicious hosts, as well as command-and-control servers, spamming addresses, and multiples (which would require a separate investigation). Importantly, we can now gain more insight into these, having worked logically through filtering out information unnecessary to our story; we next want to learn more about the malware IPs, since there are a great number of these. It is also worth noting that we are now working with just 5.9% of our original data (15,171 rows), making it much more manageable to explore and find relevant details rather than being inundated with irrelevant information. Our SOC team was struggling with the number of alerts they were dealing with; this workflow would allow them to manage the alerts much more effectively and concentrate on the key details of interest.
Having developed this in a notebook for the purpose of exploration, we could easily export it as a Python script that would run periodically to filter our alerts as needed.
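As a sketch of what such a script might look like (the function name and the excluded types below simply mirror the exploration above, and would be adjusted to the SOC's needs):
import pandas as pd

def filter_alerts(df):
    # collapse multi-type entries into a single "Multiples" label,
    # touching only the Type column this time
    df = df.copy()
    df.loc[df['Type'].str.contains(";"), 'Type'] = "Multiples"
    # drop the alert types the SOC considers trivial for this story
    trivial = ["Scanning Host", "Malware distribution", "Malware Domain"]
    return df[~df['Type'].isin(trivial)]

if __name__ == "__main__":
    av = pd.read_csv("./example_data/reputation.data", sep="#", header=None)
    av.columns = ["IP", "Reliability", "Risk", "Type", "Country", "Locale", "Coords", "x"]
    filtered = filter_alerts(av)
    print("Count: %d; Percent: %2.1f%%" % (len(filtered), float(len(filtered)) / len(av) * 100))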
This example should help to demonstrate the benefit of interactive analysis of the data, and how this can be used to rapidly design a suitable analysis workflow for deployment.