Sudomy icon indicating copy to clipboard operation
Sudomy copied to clipboard

Add support for: transparencyreport.google.com

Open ZeroDot1 opened this issue 6 years ago • 1 comments

Certificate-transparency search endpoints observed on the site:
- https://transparencyreport.google.com/transparencyreport/api/v3/httpsreport/ct/certsearch/page?p=google.com
- https://transparencyreport.google.com/transparencyreport/api/v3/httpsreport/ct/certsearch?include_expired=true&include_subdomains=true&domain=
- https://www.google.com/transparencyreport/api/v3/httpsreport/ct/certsearch?domain=

ZeroDot1 avatar Nov 03 '19 21:11 ZeroDot1

# This fetches the data displayed on the Google Safe Browsing Transparency Report and outputs it as a CSV
# that can be imported into a Kaggle dataset.
# The original visualization can be found here: https://transparencyreport.google.com/safe-browsing/overview

from datetime import datetime, date, timezone

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Epoch milliseconds of the first data point the Transparency Report serves
# (2006-05-21); every series request starts here.
EPOCH_START_MS = 1148194800000

# "Now" in epoch milliseconds.  The previous datetime.utcnow().timestamp()
# is wrong on non-UTC machines: utcnow() returns a *naive* datetime holding
# UTC wall time, and .timestamp() interprets a naive datetime as local time,
# skewing the value by the local UTC offset.  An aware UTC datetime is exact.
RUN_TIME = int(datetime.now(timezone.utc).timestamp() * 1000)

# NOTE(review): fromtimestamp() without tz converts to the machine's local
# timezone; kept as-is because load_dataframe() mixes this with other naive
# datetimes and pd.date_range rejects naive/aware mixes.
START_TIME = datetime.fromtimestamp(EPOCH_START_MS // 1000)

# API endpoints observed in the Safe Browsing Transparency Report web UI.
_API_BASE = "https://transparencyreport.google.com/transparencyreport/api/v3/safebrowsing"
UNSAFE_URL = f"{_API_BASE}/sites?dataset=0&series=malwareDetected,phishingDetected&start={EPOCH_START_MS}&end={RUN_TIME}"
NUMBER_URL = f"{_API_BASE}/sites?dataset=1&series=malware,phishing&start={EPOCH_START_MS}&end={RUN_TIME}"
SITES_URL = f"{_API_BASE}/sites?start={EPOCH_START_MS}&series=attack,compromised&end={RUN_TIME}"
BROWSER_WARNINGS_URL = f"{_API_BASE}/warnings?dataset=users&start={EPOCH_START_MS}&end={RUN_TIME}&series=users"
SEARCH_WARNINGS_URL = f"{_API_BASE}/warnings?dataset=search&start={EPOCH_START_MS}&end={RUN_TIME}&series=search"
RESPONSE_TIME_URL = f"{_API_BASE}/notify?dataset=1&start={EPOCH_START_MS}&end={RUN_TIME}&series=response"
REINFECTION_URL = f"{_API_BASE}/notify?dataset=0&start={EPOCH_START_MS}&end={RUN_TIME}&series=reinfect"

# CSV column order: the weekly index first, then one column per Safe
# Browsing series fetched by the functions below (paired detection/number
# series, site classifications, warnings, and webmaster metrics).
COLUMN_NAMES = [
    "WeekOf",
    "Malware sites detected", "Phishing sites detected",
    "Malware sites number", "Phishing sites number",
    "Attack sites", "Compromised sites",
    "Browser warnings", "Search warnings",
    "Webmaster response time", "Reinfection rate",
]

def load_dataframe():
    """Build an empty frame indexed by week, ready for the fetchers to fill.

    Returns:
        pd.DataFrame indexed by "WeekOf" (normalized weekly dates spanning
        START_TIME..RUN_TIME) with one all-NaN column per remaining entry
        of COLUMN_NAMES.
    """
    end = datetime.fromtimestamp(RUN_TIME // 1000)
    weeks = pd.date_range(start=START_TIME, end=end, freq='W', normalize=True)
    frame = pd.DataFrame(columns=COLUMN_NAMES)
    frame["WeekOf"] = weeks
    return frame.set_index("WeekOf")
    
# Pre-sized weekly frame that the fetcher functions below fill in column by column.
df = load_dataframe()

import requests
import json
def fetch_as_json(url, timeout=30):
    """GET a Transparency Report API endpoint and return its series points.

    The API prepends a 5-byte anti-XSSI prefix to every response body, so
    the first 5 bytes are stripped before JSON-decoding.  The decoded
    payload is a wrapper list; element [0][1] holds the data points.

    Args:
        url: Full API URL to fetch.
        timeout: Socket timeout in seconds (default 30) so a stalled
            connection cannot hang the script forever.

    Returns:
        The list of data points at payload[0][1].

    Raises:
        requests.HTTPError: on a non-2xx response.  Previously an error
            page fell through to json.loads() and produced a confusing
            decode error instead.
    """
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    payload = json.loads(r.content[5:])
    return payload[0][1]

def malware_phishing_detected(df):
    """Fill the "Malware sites detected" / "Phishing sites detected" columns.

    Each API point has the shape [epoch_ms, [[malware], [phishing]]]; an
    empty inner list means no datum for that week.

    Args:
        df: Weekly-indexed frame from load_dataframe(); mutated in place.

    Returns:
        The same frame, for chaining.
    """
    for pt in fetch_as_json(UNSAFE_URL):
        date = pd.to_datetime(pt[0], unit='ms').normalize()
        malware = pt[1][0]
        phishing = pt[1][1]
        # .loc replaces chained assignment (df[col][date] = ...), which
        # writes through an intermediate Series and under pandas
        # copy-on-write silently writes to a copy, dropping the data.
        df.loc[date, COLUMN_NAMES[1]] = malware[0] if malware else np.nan
        df.loc[date, COLUMN_NAMES[2]] = phishing[0] if phishing else np.nan
    return df

def malware_phishing_number(df):
    """Fill the "Malware sites number" / "Phishing sites number" columns.

    Each API point has the shape [epoch_ms, [[malware], [phishing]]]; an
    empty inner list means no datum for that week.

    Args:
        df: Weekly-indexed frame from load_dataframe(); mutated in place.

    Returns:
        The same frame, for chaining.
    """
    for pt in fetch_as_json(NUMBER_URL):
        date = pd.to_datetime(pt[0], unit='ms').normalize()
        malware = pt[1][0]
        phishing = pt[1][1]
        # .loc avoids chained assignment, which can silently write to a
        # copy under pandas copy-on-write; np.nan replaces the removed
        # np.NaN alias (gone in NumPy 2.0).
        df.loc[date, COLUMN_NAMES[3]] = malware[0] if malware else np.nan
        df.loc[date, COLUMN_NAMES[4]] = phishing[0] if phishing else np.nan
    return df
        
def site_count(df):
    """Fill the "Attack sites" / "Compromised sites" columns.

    Each API point has the shape [epoch_ms, [[attack], [compromised]]]; an
    empty inner list means no datum for that week.

    Args:
        df: Weekly-indexed frame from load_dataframe(); mutated in place.

    Returns:
        The same frame, for chaining.
    """
    for pt in fetch_as_json(SITES_URL):
        date = pd.to_datetime(pt[0], unit='ms').normalize()
        attack = pt[1][0]
        comped = pt[1][1]
        # .loc avoids chained assignment, which can silently write to a
        # copy under pandas copy-on-write.
        df.loc[date, COLUMN_NAMES[5]] = attack[0] if attack else np.nan
        df.loc[date, COLUMN_NAMES[6]] = comped[0] if comped else np.nan
    return df
    
def browser_warnings(df):
    """Fill the "Browser warnings" column.

    Each API point has the shape [epoch_ms, [[value]]]; an empty inner
    list means no datum for that week.

    Args:
        df: Weekly-indexed frame from load_dataframe(); mutated in place.

    Returns:
        The same frame, for chaining.
    """
    for pt in fetch_as_json(BROWSER_WARNINGS_URL):
        date = pd.to_datetime(pt[0], unit='ms').normalize()
        value = pt[1][0]
        # .loc avoids chained assignment, which can silently write to a
        # copy under pandas copy-on-write.
        df.loc[date, COLUMN_NAMES[7]] = value[0] if value else np.nan
    return df
    
def search_warnings(df):
    """Fill the "Search warnings" column.

    Each API point has the shape [epoch_ms, [[value]]]; an empty inner
    list means no datum for that week.

    Args:
        df: Weekly-indexed frame from load_dataframe(); mutated in place.

    Returns:
        The same frame, for chaining.
    """
    for pt in fetch_as_json(SEARCH_WARNINGS_URL):
        date = pd.to_datetime(pt[0], unit='ms').normalize()
        value = pt[1][0]
        # .loc avoids chained assignment, which can silently write to a
        # copy under pandas copy-on-write.
        df.loc[date, COLUMN_NAMES[8]] = value[0] if value else np.nan
    return df
    
def response_time(df):
    """Fill the "Webmaster response time" column.

    Each API point has the shape [epoch_ms, [[value]]]; an empty inner
    list means no datum for that week.

    Args:
        df: Weekly-indexed frame from load_dataframe(); mutated in place.

    Returns:
        The same frame, for chaining.
    """
    for pt in fetch_as_json(RESPONSE_TIME_URL):
        date = pd.to_datetime(pt[0], unit='ms').normalize()
        value = pt[1][0]
        # .loc avoids chained assignment, which can silently write to a
        # copy under pandas copy-on-write.
        df.loc[date, COLUMN_NAMES[9]] = value[0] if value else np.nan
    return df
    
def reinfection_rate(df):
    """Fill the "Reinfection rate" column as an integer percentage.

    Args:
        df: Weekly-indexed frame from load_dataframe(); mutated in place.

    Returns:
        The same frame, for chaining.
    """
    for pt in fetch_as_json(REINFECTION_URL):
        date = pd.to_datetime(pt[0], unit='ms').normalize()
        value = pt[1][0]
        # Multiply by 100 and cast to int to save space on import.
        # NOTE(review): this series reads index [1] of the inner list,
        # unlike the other series' [0] — presumably the rate sits at
        # position 1 in this payload; confirm against the live API.
        rate = int(value[1] * 100) if value else np.nan
        # .loc avoids chained assignment, which can silently write to a
        # copy under pandas copy-on-write.
        df.loc[date, COLUMN_NAMES[10]] = rate
    return df

# Populate every series in turn, then export the assembled table.  Each
# populate function mutates df in place and returns it for chaining.
for _populate in (
    malware_phishing_detected,
    malware_phishing_number,
    site_count,
    browser_warnings,
    search_warnings,
    response_time,
    reinfection_rate,
):
    df = _populate(df)
df.to_csv("data.csv", header=True, index=True, index_label="WeekOf")

ZeroDot1 avatar Nov 03 '19 21:11 ZeroDot1