Skip to content

Commit

Permalink
Add new data sources
Browse files Browse the repository at this point in the history
  • Loading branch information
akuny committed Feb 23, 2024
1 parent 0a34116 commit 4bcc04c
Show file tree
Hide file tree
Showing 23 changed files with 157,341 additions and 85,229 deletions.
16 changes: 14 additions & 2 deletions builder/config.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from datetime import datetime
import os


Expand All @@ -9,7 +8,14 @@
'pulse_source_url': 'https://raw.githubusercontent.com/GSA/data/master/dotgov-websites/pulse-subdomains-snapshot-06-08-2020-https.csv',
'dap_source_url': 'https://analytics.usa.gov/data/live/sites-extended.csv',
'omb_source_url': 'https://resources.data.gov/schemas/dcat-us/v1.1/omb_bureau_codes.csv',
'omb_idea_source_url': 'https://raw.githubusercontent.com/GSA/federal-website-index/main/data/dataset/omb_idea.csv',
'2020_eotw_source_url': 'https://raw.githubusercontent.com/GSA/federal-website-index/main/data/dataset/2020_eot.csv',
'usagov_directory_source_url': 'https://raw.githubusercontent.com/GSA/federal-website-index/main/data/dataset/usagov_directory.csv',
'gov_man_22_source_url': 'https://raw.githubusercontent.com/GSA/federal-website-index/main/data/dataset/gov_man-22.csv',
'usacourts_source_url': 'https://raw.githubusercontent.com/GSA/federal-website-index/main/data/dataset/uscourts.csv',
'oira_source_url': 'https://raw.githubusercontent.com/GSA/federal-website-index/main/data/dataset/oira.csv',
'mil_source_url': 'https://raw.githubusercontent.com/GSA/federal-website-index/main/data/dataset/dotmil_websites.csv',
'mil_source_url_2': 'https://raw.githubusercontent.com/GSA/federal-website-index/main/data/dataset/dotmil_websites-2.csv',
'mil_domains_url': 'https://raw.githubusercontent.com/GSA/federal-website-index/main/data/dataset/dotmil_domains.csv',
'other_websites_path': os.path.join(dirname, '../data/dataset/other-websites.csv'),
'ignore_list_begins_path': os.path.join(dirname, '../criteria/ignore-list-begins.csv'),
Expand All @@ -18,6 +24,12 @@
'gov_snapshot_path': os.path.join(dirname, '../data/snapshots/gov.csv'),
'pulse_snapshot_path': os.path.join(dirname, '../data/snapshots/pulse.csv'),
'dap_snapshot_path': os.path.join(dirname, '../data/snapshots/dap.csv'),
'omb_idea_snapshot_path': os.path.join(dirname, '../data/snapshots/omb_idea.csv'),
'2020_eotw_snapshot_path': os.path.join(dirname, '../data/snapshots/2020_eot.csv'),
'usagov_directory_snapshot_path': os.path.join(dirname, '../data/snapshots/usagov_directory.csv'),
'gov_man_22_snapshot_path': os.path.join(dirname, '../data/snapshots/gov_man_22.csv'),
'usacourts_snapshot_path': os.path.join(dirname, '../data/snapshots/usacourts.csv'),
'oira_snapshot_path': os.path.join(dirname, '../data/snapshots/oira.csv'),
'other_snapshot_path': os.path.join(dirname, '../data/snapshots/other.csv'),
'combined_snapshot_path': os.path.join(dirname, '../data/snapshots/combined.csv'),
'remove_ignore_begins_path': os.path.join(dirname, '../data/snapshots/remove-ignore-begins.csv'),
Expand All @@ -30,4 +42,4 @@
'analysis_csv_path': os.path.join(dirname, '../data/site-scanning-target-url-list-analysis.csv'),
'url_df_pre_base_domains_merged': os.path.join(dirname, '../data/test/url_df_pre_base_domains_merged.csv'),
'url_df_post_base_domains_merged': os.path.join(dirname, '../data/test/url_df_post_base_domains_merged.csv'),
}
}
5 changes: 3 additions & 2 deletions builder/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@
import requests


def csv_to_df(url, has_headers=True):
    """Download a CSV from *url* and return it as a pandas DataFrame.

    Args:
        url: location of the CSV file to fetch.
        has_headers: when False, the first row is treated as data and
            columns get positional names (0, 1, ...).

    Returns:
        pandas.DataFrame with the parsed CSV contents.

    Raises:
        requests.HTTPError: if the server responds with a 4xx/5xx status.
    """
    response = requests.get(url)
    # Fail fast on an HTTP error instead of silently parsing an error page.
    response.raise_for_status()
    header_option = 'infer' if has_headers else None
    # Note: avoid naming the payload `bytes` — that shadows the builtin.
    df = pd.read_csv(io.StringIO(response.content.decode('utf8')), header=header_option)
    return df

def round_float(x):
Expand Down
178 changes: 155 additions & 23 deletions builder/main.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from config import config
from helpers import csv_to_df, round_float, dict_to_csv
import numpy as np
import pandas as pd


Expand All @@ -8,21 +9,42 @@ def fetch_data(analysis):
gov_df = csv_to_df(config['gov_source_url'])
pulse_df = csv_to_df(config['pulse_source_url'])
dap_df = csv_to_df(config['dap_source_url'])
# datasets added in February, 2024
omb_idea_df = csv_to_df(config['omb_idea_source_url'])
eotw_df = csv_to_df(config['2020_eotw_source_url'])
usagov_df = csv_to_df(config['usagov_directory_source_url'], has_headers=False)
gov_man_df = csv_to_df(config['gov_man_22_source_url'], has_headers=False)
usacourts_df = csv_to_df(config['usacourts_source_url'], has_headers=False)
oira_df = csv_to_df(config['oira_source_url'])

other_df = pd.read_csv(config['other_websites_path'])
other_df['source_list_other'] = 'TRUE'

# track length of source datasets
analysis['gov url list length'] = len(gov_df.index)
analysis['pulse url list length'] = len(pulse_df.index)
analysis['dap url list length'] = len(dap_df.index)
analysis['omb idea url list length'] = len(omb_idea_df.index)
analysis['eotw url list length'] = len(eotw_df.index)
analysis['usagov url list length'] = len(usagov_df.index)
analysis['gov_man url list length'] = len(gov_man_df.index)
analysis['usacourts url list length'] = len(usacourts_df.index)
analysis['oira url list length'] = len(oira_df.index)
analysis['other website url list length'] = len(other_df.index)

# create new snapshots of source files
gov_df.to_csv(config['gov_snapshot_path'], index=False)
pulse_df.to_csv(config['pulse_snapshot_path'], index=False)
dap_df.to_csv(config['dap_snapshot_path'], index=False)
omb_idea_df.to_csv(config['omb_idea_snapshot_path'], index=False)
eotw_df.to_csv(config['2020_eotw_snapshot_path'], index=False)
usagov_df.to_csv(config['usagov_directory_snapshot_path'], index=False)
gov_man_df.to_csv(config['gov_man_22_snapshot_path'], index=False)
usacourts_df.to_csv(config['usacourts_snapshot_path'], index=False)
oira_df.to_csv(config['oira_snapshot_path'], index=False)
other_df.to_csv(config['other_snapshot_path'], index=False)
return gov_df, pulse_df, dap_df, other_df, analysis
return gov_df, pulse_df, dap_df, omb_idea_df, eotw_df, usagov_df, gov_man_df, \
usacourts_df, oira_df, other_df, analysis

def format_gov_df(df):
# drop unnecessary columns
Expand All @@ -31,7 +53,7 @@ def format_gov_df(df):
df = df.rename(columns={'Domain name': 'target_url', 'Domain type': 'branch', 'Agency': 'agency', 'Organization name': 'bureau'})
# convert to lowercase
df['target_url'] = df['target_url'].str.lower()
df['base_domain'] = df['target_url']
df['base_domain_gov'] = df['target_url']
df['source_list_federal_domains'] = 'TRUE'
# strip out 'Federal - ' leading string from domain type column for .gov data
df['branch'] = df['branch'].map(lambda x: x.lstrip('Federal - '))
Expand All @@ -48,27 +70,69 @@ def format_pulse_df(df):
'Strict Transport Security (HSTS)', 'Free of RC4/3DES and SSLv2/SSLv3', '3DES', 'RC4', 'SSLv2', 'SSLv3',
'Preloaded'])
# rename columns
df = df.rename(columns={'Domain': 'target_url', 'Base Domain': 'base_domain'})
df = df[['target_url', 'base_domain']]
df = df.rename(columns={'Domain': 'target_url', 'Base Domain': 'base_domain_pulse'})
df = df[['target_url', 'base_domain_pulse']]
df['source_list_pulse'] = 'TRUE'
return df

def format_dap_df(df):
    """Normalize the DAP source list: standard URL column, base domain, flag."""
    renamed = df.rename(columns={'domain': 'target_url'})
    # Base domain = the final two dot-separated labels of the URL.
    renamed['base_domain'] = renamed['target_url'].map(
        lambda url: '.'.join(url.split('.')[-2:]))
    renamed['source_list_dap'] = 'TRUE'
    return renamed

def format_omb_idea_df(df):
    """Normalize the OMB IDEA source list and drop duplicate rows."""
    out = df.rename(columns={'Website': 'target_url',
                             'Public-Facing': 'omb_idea_public'})
    out['source_list_omb_idea'] = 'TRUE'
    # Translate Yes/No answers into the TRUE/FALSE strings used elsewhere.
    out['omb_idea_public'] = out['omb_idea_public'].map({'Yes': 'TRUE', 'No': 'FALSE'})
    return out.drop_duplicates()

def format_eotw_df(df):
    """Normalize the 2020 End of Term Web Archive list and dedupe."""
    out = df.rename(columns={'URL': 'target_url'})
    out['source_list_eotw'] = 'TRUE'
    return out.drop_duplicates()

def format_usagov_df(df):
    """Normalize the headerless USA.gov directory list and dedupe."""
    # Headerless CSVs arrive with positional column names; column 0 is the URL.
    renamed = df.rename(columns={0: 'target_url'})
    renamed['source_list_usagov'] = 'TRUE'
    return renamed.drop_duplicates()

def format_gov_man_df(df):
    """Normalize the headerless Government Manual list and dedupe."""
    # Headerless CSVs arrive with positional column names; column 0 is the URL.
    renamed = df.rename(columns={0: 'target_url'})
    renamed['source_list_gov_man'] = 'TRUE'
    return renamed.drop_duplicates()

def format_usacourts_df(df):
    """Normalize the headerless US Courts list and dedupe."""
    # Headerless CSVs arrive with positional column names; column 0 is the URL.
    renamed = df.rename(columns={0: 'target_url'})
    renamed['source_list_usacourts'] = 'TRUE'
    return renamed.drop_duplicates()

def format_oira_df(df):
    """Normalize the OIRA source list and drop duplicate rows."""
    out = df.rename(columns={'URL': 'target_url'})
    out['source_list_oira'] = 'TRUE'
    return out.drop_duplicates()

def format_other_df(df):
    """Annotate the 'other websites' list, in place, with base domain and flag."""
    # Base domain = the final two dot-separated labels of the target URL.
    df['base_domain_other'] = df['target_url'].apply(
        lambda url: '.'.join(url.split('.')[-2:]))
    df['source_list_other'] = 'TRUE'
    return df

def format_source_columns(df):
    """Replace empty strings with 'FALSE' in every source-list flag column.

    After the upstream left merges, a row absent from a given source list
    carries '' in that list's flag column; normalize those to 'FALSE' so the
    flags are uniformly 'TRUE'/'FALSE'. Mutates and returns *df*.
    """
    # One loop instead of eleven copy-pasted map() lines (same columns, same rule).
    flag_columns = [
        'source_list_federal_domains',
        'source_list_pulse',
        'source_list_dap',
        'source_list_omb_idea',
        'source_list_eotw',
        'source_list_usagov',
        'source_list_gov_man',
        'source_list_usacourts',
        'source_list_oira',
        'source_list_other',
        'omb_idea_public',
    ]
    for column in flag_columns:
        df[column] = df[column].map(lambda value: 'FALSE' if value == '' else value)
    return df

def merge_agencies(df, agency_df):
Expand Down Expand Up @@ -109,29 +173,64 @@ def format_agency_and_bureau_codes(df):
return df

def get_mil_subset():
    """Build the .mil portion of the target URL list.

    Combines the two .mil website source lists, keeps only URLs whose base
    domain appears in the official .mil domain list, and fills in agency and
    bureau values from that domain list where the website lists lack them.

    Returns:
        pandas.DataFrame with the same column layout as the .gov URL frame.
    """
    first_mil_df = csv_to_df(config['mil_source_url'])
    second_mil_df = csv_to_df(config['mil_source_url_2'], has_headers=False)
    # The second list is headerless; column 0 holds the website URL.
    second_mil_df = second_mil_df.rename(columns={0: 'Website'})
    df = pd.concat([first_mil_df, second_mil_df], ignore_index=True)

    df = df.rename(columns={'Website': 'target_url', 'Agency': 'agency', 'Bureau': 'bureau', 'Branch': 'branch'})
    df['branch'] = 'Executive'
    df['agency_code'] = 0
    df['bureau_code'] = 0
    # Every source-list flag defaults to FALSE except the .mil list itself.
    for flag in ('source_list_federal_domains', 'source_list_dap', 'source_list_pulse',
                 'source_list_omb_idea', 'source_list_eotw', 'source_list_usagov',
                 'source_list_gov_man', 'source_list_usacourts', 'source_list_oira',
                 'source_list_other'):
        df[flag] = 'FALSE'
    df['source_list_mil'] = 'TRUE'
    df['omb_idea_public'] = 'FALSE'
    df['base_domain'] = df['target_url'].map(lambda x: '.'.join(x.split('.')[-2:]))

    # Keep only rows whose base domain is an official .mil domain.
    mil_domains_df = csv_to_df(config['mil_domains_url'])
    mil_domains_set = set(mil_domains_df['Domain name'])
    # .copy() so later column assignments act on a real frame, not a view.
    df = df[df['base_domain'].isin(mil_domains_set)].copy()
    df['top_level_domain'] = 'mil'

    # Populate agency and bureau from the domain list where missing.
    mil_domains_df = mil_domains_df.rename(columns={'Domain name': 'base_domain', 'Agency': 'agency', 'Organization name': 'bureau'})
    df = df.merge(mil_domains_df, on='base_domain', how='left')
    df = df.fillna('')

    # Vectorized coalesce (replaces the previous per-row iterrows loop):
    # prefer the website list's value (_x), fall back to the domain list (_y).
    df['agency'] = df['agency_x'].where(df['agency_x'] != '', df['agency_y'])
    df['bureau'] = df['bureau_x'].where(df['bureau_x'] != '', df['bureau_y'])

    # drop temp agency/bureau columns
    df = df.drop(columns=['agency_x', 'agency_y', 'bureau_x', 'bureau_y'])

    # Reorder columns
    df = df[['target_url', 'base_domain', 'top_level_domain', 'branch', 'agency', 'agency_code',
             'bureau', 'bureau_code', 'source_list_federal_domains', 'source_list_dap',
             'source_list_pulse', 'source_list_omb_idea', 'source_list_eotw',
             'source_list_usagov', 'source_list_gov_man', 'source_list_usacourts',
             'source_list_oira', 'source_list_other', 'source_list_mil', 'omb_idea_public']]

    return df

Expand All @@ -140,15 +239,30 @@ def get_mil_subset():
analysis = {}

# import data
gov_df_raw, pulse_df_raw, dap_df_raw, other_df_raw, analysis = fetch_data(analysis)
gov_df_raw, pulse_df_raw, dap_df_raw, omb_idea_df_raw, eotw_df_raw, usagov_df_raw, gov_man_df_raw, \
usacourts_df_raw, oira_df_raw, other_df_raw, analysis = fetch_data(analysis)

gov_df = format_gov_df(gov_df_raw)
pulse_df = format_pulse_df(pulse_df_raw)
dap_df = format_dap_df(dap_df_raw)
other_df = format_other_df(other_df_raw)

# February 2024 datasets
omb_idea_df = format_omb_idea_df(omb_idea_df_raw)
eotw_df = format_eotw_df(eotw_df_raw)
usagov_df = format_usagov_df(usagov_df_raw)
gov_man_df = format_gov_man_df(gov_man_df_raw)
usacourts_df = format_usacourts_df(usacourts_df_raw)
oira_df = format_oira_df(oira_df_raw)

# combine all URLs into one column
url_series = pd.concat([gov_df['target_url'], pulse_df['target_url'], dap_df['target_url'], other_df['target_url']])
print("Combining all URLs into one column")
url_series = pd.concat([gov_df['target_url'], pulse_df['target_url'],
dap_df['target_url'], other_df['target_url'],
omb_idea_df['target_url'], eotw_df['target_url'],
usagov_df['target_url'], gov_man_df['target_url'],
usacourts_df['target_url'], oira_df['target_url']])

url_df = pd.DataFrame(url_series)
analysis['combined url list length'] = len(url_df.index)
url_df.to_csv(config['combined_snapshot_path'], index=False)
Expand Down Expand Up @@ -185,24 +299,35 @@ def get_mil_subset():
url_df = url_df.merge(gov_df, on='target_url', how='left')
url_df = url_df.merge(pulse_df, on='target_url', how='left')
url_df = url_df.merge(dap_df, on='target_url', how='left')
url_df = url_df.merge(omb_idea_df, on='target_url', how='left')
url_df = url_df.merge(eotw_df, on='target_url', how='left',)
url_df = url_df.merge(usagov_df, on='target_url', how='left')
url_df = url_df.merge(gov_man_df, on='target_url', how='left')
url_df = url_df.merge(usacourts_df, on='target_url', how='left')
url_df = url_df.merge(oira_df, on='target_url', how='left')
url_df = url_df.merge(other_df, on='target_url', how='left')
url_df = url_df.fillna('')

url_df.to_csv(config['url_df_pre_base_domains_merged'], index=False)

# populate base domain column
url_df['base_domain'] = ''
for idx, row in url_df.iterrows():
if row['base_domain'] == '':
if row['base_domain_x'] != '':
url_df.at[idx, 'base_domain'] = row['base_domain_x']
elif row['base_domain_y'] != '':
url_df.at[idx, 'base_domain'] = row['base_domain_y']
elif row['base_domain_other'] != '':
url_df.at[idx, 'base_domain'] = row['base_domain_other']
if row['base_domain_gov'] != '':
url_df.at[idx, 'base_domain'] = row['base_domain_gov']
elif row['base_domain_pulse'] != '':
url_df.at[idx, 'base_domain'] = row['base_domain_pulse']
else:
url_df.at[idx, 'base_domain'] = '.'.join(row['target_url'].split('.')[-2:])

url_df.to_csv(config['url_df_post_base_domains_merged'], index=False)

# get relevant subset
url_df = url_df[['target_url', 'base_domain', 'branch', 'agency', 'bureau', 'source_list_federal_domains', 'source_list_pulse', 'source_list_dap', 'source_list_other']]
url_df = url_df[['target_url', 'base_domain', 'branch', 'agency', 'bureau',
'source_list_federal_domains', 'source_list_pulse',
'source_list_dap', 'source_list_omb_idea', 'source_list_eotw',
'source_list_usagov', 'source_list_gov_man', 'source_list_usacourts',
'source_list_oira','source_list_other', 'omb_idea_public']]

# format source columns
url_df = format_source_columns(url_df)
Expand All @@ -211,14 +336,16 @@ def get_mil_subset():
url_df[['branch']] = url_df[['branch']].replace('', 'Executive')

# get lookup table of agencies mapped to base domain
agency_df = gov_df[['base_domain', 'agency']]
agency_df = gov_df[['base_domain_gov', 'agency']]
agency_df = agency_df.rename(columns={'base_domain_gov': 'base_domain'})
agency_df = agency_df.drop_duplicates()

# merge in agencies
url_df = merge_agencies(url_df, agency_df)

# get lookup table of bureaus mapped to base domain
bureau_df = gov_df[['base_domain', 'bureau']]
bureau_df = gov_df[['base_domain_gov', 'bureau']]
bureau_df = bureau_df.rename(columns={'base_domain_gov': 'base_domain'})
bureau_df = bureau_df.drop_duplicates()

# merge in bureaus
Expand All @@ -241,12 +368,17 @@ def get_mil_subset():
url_df = format_agency_and_bureau_codes(url_df)

# reorder columns, sort, remove duplicates
url_df = url_df[['target_url', 'base_domain', 'branch', 'agency', 'agency_code', 'bureau', 'bureau_code', 'source_list_federal_domains', 'source_list_dap', 'source_list_pulse', 'source_list_other']]
url_df = url_df[['target_url', 'base_domain', 'branch', 'agency', 'agency_code',
'bureau', 'bureau_code', 'source_list_federal_domains',
'source_list_dap', 'source_list_pulse', 'source_list_omb_idea',
'source_list_eotw', 'source_list_usagov', 'source_list_gov_man',
'source_list_usacourts', 'source_list_oira', 'source_list_other',
'omb_idea_public']]
url_df = url_df.sort_values(by=['base_domain', 'target_url'])
url_df = url_df.drop_duplicates('target_url')

# remove all non-.gov urls
gov_base_domains = set(gov_df.base_domain)
gov_base_domains = set(gov_df.base_domain_gov)
analysis['number of .gov base domains'] = len(gov_base_domains)
url_df['is_gov'] = url_df['base_domain'].apply(lambda x: x in gov_base_domains)
non_gov_df = url_df[url_df['is_gov'] == False]
Expand Down
22 changes: 14 additions & 8 deletions data/site-scanning-target-url-list-analysis.csv
Original file line number Diff line number Diff line change
@@ -1,12 +1,18 @@
question,answer
gov url list length,1349
gov url list length,1350
pulse url list length,24637
dap url list length,8284
omb idea url list length,8493
eotw url list length,8146
usagov url list length,1404
gov_man url list length,5991
usacourts url list length,341
oira url list length,7548
other website url list length,7
combined url list length,35626
deduped url list length,30947
url list length after ignore list checking beginnning of urls processed,29041
url list length after ignore list checking entire url,27767
number of .gov base domains,1349
number of urls with non-.gov base domains removed,1472
url list length after non-federal urls removed,26295
combined url list length,49326
deduped url list length,34333
url list length after ignore list checking beginnning of urls processed,32422
url list length after ignore list checking entire url,31133
number of .gov base domains,1350
number of urls with non-.gov base domains removed,2877
url list length after non-federal urls removed,28256
Loading

0 comments on commit 4bcc04c

Please sign in to comment.