linkedin-api
Profile's interest scraping
Hi, I want to scrape a profile's interests (the companies that someone follows). I realized that the get_profile function in the API doesn't return anything about the profile's interests; a different URL has to be hit to get them. I wrote a function to solve this, but it still doesn't work.
def get_interest(self, public_id=None, urn_id=None):
    # Resolve the profile's dash URN, either directly from urn_id or by
    # looking the profile up from its public_id.
    if urn_id:
        profile_urn = f"urn:li:fsd_profile:{urn_id}"
    else:
        profile = self.get_profile(public_id=public_id)
        profile_urn = profile["profile_urn"].replace("fs_miniProfile", "fsd_profile")

    # Hit the Voyager GraphQL endpoint that backs the interests cards.
    res = self._fetch(
        f"/graphql?variables=(profileUrn:{profile_urn})"
        f"&queryId=voyagerIdentityDashProfileCards"
        f".b0928897b71bd00a5a7291755dcd64f0"
    )
    data = res.json()

    if data and "status" in data and data["status"] != 200:
        self.logger.info("request failed: {}".format(data["message"]))
        return {}
    return data
I think the problem might be the queryId, since it is dynamic. Does anyone have an idea about it?
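One way I can think of to narrow it down is to capture a fresh queryId from the browser's DevTools Network tab (look for a voyagerIdentityDashProfileCards request while opening a profile) and inspect the raw response before parsing JSON. A rough diagnostic sketch; the helper name is made up, and the assumption that an expired queryId comes back as a non-200 or non-JSON response is just a guess:

```python
def debug_interest_request(self, profile_urn, query_id):
    """Issue the interests request and report what actually comes back.

    profile_urn and query_id are whatever values were captured from the
    browser (purely diagnostic, not a confirmed API contract).
    """
    res = self._fetch(
        f"/graphql?variables=(profileUrn:{profile_urn})"
        f"&queryId={query_id}"
    )
    print("HTTP status:", res.status_code)
    try:
        payload = res.json()
    except ValueError:
        # Not JSON at all -- likely the request was rejected outright.
        print("Non-JSON response:", res.text[:500])
        return None
    # Voyager errors often carry a status/message pair in the body.
    if isinstance(payload, dict) and payload.get("status", 200) != 200:
        print("API error:", payload.get("message"))
    return payload
```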
Curious to know if you ever managed to identify a workaround for this?
I was curious as well and scripted something with Selenium to achieve this.
```python
# Import necessary libraries
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import tkinter as tk
from tkinter import simpledialog
import time
from datetime import datetime
from bs4 import BeautifulSoup
import re
import undetected_chromedriver as uc
import pandas as pd
import json
from pprint import pprint

# Adjust pandas display settings
pd.set_option('display.max_colwidth', None)

# Initialize an empty list to store profiles' data
profiles_data = []

# Prompt the user to enter their LinkedIn login credentials.
def get_credentials():
    .......
    return email, password

# Log in to LinkedIn using Selenium.
def login_to_linkedin(driver, email, password):
    driver.get('https://www.linkedin.com/login')
    driver.find_element(By.ID, 'username').send_keys(email)
    driver.find_element(By.ID, 'password').send_keys(password + Keys.RETURN)
    time.sleep(5)  # Awaiting login completion

# Adjust the given LinkedIn URL to navigate to the specified section.
def adjust_url_for_section(url, section):
    base_url = url.split('?')[0]
    return base_url + f"/details/{section}/"

# Define the list of tabs to extract data from
tabs = ["Top Voices", "Companies", "Groups", "Newsletters", "Schools"]

def click_tab_and_extract_data(driver, tab_name):
    try:
        # Click on the tab
        tab_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, f"//button[span[contains(text(), '{tab_name}')]]"))
        )
        tab_element.click()
        # Wait for the content to load and extract names or entities
        names_elements = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".display-flex.align-items-center.mr1.hoverable-link-text.t-bold span[aria-hidden='true']"))
        )
        names = [element.text for element in names_elements if element.text.strip() != '']  # Filter out empty names/entities
        return names
    except Exception:
        print(f"Couldn't extract data from {tab_name} tab or tab not present.")
        return []

# Create an empty list to store all the flattened profiles
flattened_profiles = []

try:
    for url in urls:
        if interrupted:
            break  # Exit the loop if interrupted
        driver.get(url)
        time.sleep(3)

        # Dictionary to store the profile's data
        profile_data = {
            "LinkedIn URL": url  # Add the LinkedIn URL as an identifier
        }
        .....

        # Extract interests
        interests_url = adjust_url_for_section(url, 'interests')
        driver.get(interests_url)
        time.sleep(3)
        interests_data = {}
        for tab_name in tabs:
            interests_data[tab_name] = click_tab_and_extract_data(driver, tab_name)
        profile_data['Interests'] = interests_data

        # After successfully scraping a profile, add its timestamp to the list
        scraped_timestamps.append(datetime.now())

        # Flatten the data and append to the list
        flattened_profiles.append(flatten_data(profile_data))
        profiles_data.append(profile_data)

        # Pretty print the profile data
        print("\nExtracted Data for Profile:")
        pprint(profile_data)
        print("\n" + "-"*50 + "\n")
except KeyboardInterrupt:
    interrupted = True
    print("Script manually stopped. Data saved up to this point.")

# After all URLs are processed, write all data to the Excel file
df_temp = pd.DataFrame(flattened_profiles)

# Read the existing data
df_existing = pd.read_excel(output_file)

# Append the new data
df_final = pd.concat([df_existing, df_temp], ignore_index=True)

# Save the combined data
df_final.to_excel(output_file, index=False)

# Save the extracted data to a JSON file
with open("xxxxxxxxxxxx", 'w', encoding='utf-8') as json_file:
    json.dump(profiles_data, json_file, ensure_ascii=False, indent=4)

# Keep the script (and the browser session) alive
while True:
    time.sleep(10)
```
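The script above calls a flatten_data helper that isn't shown (it lives in one of the omitted sections). If you just need the nested Interests dictionary turned into flat columns for the Excel export, a minimal sketch could look like this (purely illustrative; the real helper may do more):

```python
def flatten_data(profile_data):
    """Flatten a nested profile dict into a single-level dict.

    Nested dicts become "Parent - Child" keys and lists are joined into
    comma-separated strings, so each profile maps onto one Excel row.
    """
    flat = {}
    for key, value in profile_data.items():
        if isinstance(value, dict):
            for sub_key, sub_value in value.items():
                if isinstance(sub_value, list):
                    sub_value = ", ".join(sub_value)
                flat[f"{key} - {sub_key}"] = sub_value
        elif isinstance(value, list):
            flat[key] = ", ".join(value)
        else:
            flat[key] = value
    return flat
```

With the dictionary built in the loop above, this would produce columns such as "Interests - Companies" holding a comma-separated list of company names, alongside the plain "LinkedIn URL" column.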