Question posted in Json
Our archive of expertly curated questions and answers provides insights and solutions to common problems related to this popular data interchange format. From parsing and manipulating JSON data to integrating it with various programming languages and web services, our archive has got you covered. Start exploring today and take your JSON skills to the next level

Scrape ssrn.com with R (jsonlite or rvest) – only first 50 results shown

Jochen
January 22, 2023
164 views
0 votes
2 Answers

I am trying to scrape the first 200 entries from https://www.ssrn.com/index.cfm/en/arn/?page=1&sort=0 (title, authors, url, …). So far I have used rvest (which worked fine, looping over the first 4 pages, until this week), and I am now trying to read the JSON directly from https://api.ssrn.com/content/v1/bindings/204/papers. The code works fine (see below), but I don’t know how to get, or even display, more than the first 50 entries (out of 43602). Is there a solution using jsonlite or rvest?

Any help appreciated! Thanks in advance.

library(jsonlite)
# Endpoint that serves the SSRN paper listing as JSON.
json_file <- "https://api.ssrn.com/content/v1/bindings/204/papers"
# Download and parse the response, then coerce the parsed result to a data frame.
data <- as.data.frame(fromJSON(json_file))

Tags: json jsonlite r rvest web-scraping

Answers

If you look at the API URL, you can set the `index` (start offset) and `count` (page size) query parameters. The maximum `count` is 200 per request, so map over a sequence of index offsets to get all 43602 entries, like so (2–3 min scraping time):

library(tidyverse) 
library(httr2)

#' Fetch one page of SSRN papers from the JSON API
#'
#' Requests `count=200` papers starting at offset `index`, parses the JSON
#' body and returns its `papers` element as a tibble. A progress line is
#' printed to the console for every request.
#'
#' @param index Start offset sent as the `index` query parameter.
#' @return A tibble built from the `papers` element of the response.
get_ssrn <- function(index) {
  # Terminate the progress line with a real newline.
  cat("Scraping index:", index, "\n")
  # Format the offset without scientific notation so that large values
  # (e.g. 1e+05) still yield a valid query string.
  str_c("https://api.ssrn.com/content/v1/bindings/204/papers?index=",
        format(index, scientific = FALSE, trim = TRUE),
        "&count=200&sort=0") %>%
    request() %>%
    req_perform() %>%
    resp_body_json(simplifyVector = TRUE) %>%
    pluck("papers") %>%
    as_tibble()
}

# Page offsets 0, 200, ..., 43600: fetch each page and row-bind the results.
df <- seq(0, 43602, by = 200) %>%
  map_dfr(get_ssrn)

df

# A tibble: 43,602 × 13
   abstract_…¹ publi…² is_paid refer…³ page_…⁴ title authors affil…⁵     id is_ap…⁶ appro…⁷ downl…⁸
   <chr>       <chr>   <lgl>   <chr>     <int> <chr> <list>  <chr>    <int> <lgl>   <chr>     <int>
 1 Working Pa… UNDER … FALSE   ""           68 "Is … <df>    "Conco… 4.33e6 TRUE    20 Jan…      27
 2 Working Pa… UNDER … FALSE   ""           58 "The… <df>    "Unive… 4.33e6 TRUE    20 Jan…      14
 3 Working Pa… UNDER … FALSE   ""            7 "App… <df>    "Atma … 4.33e6 TRUE    20 Jan…       2
 4 Working Pa… UNDER … FALSE   ""            7 "The… <df>    "Atmaj… 4.33e6 TRUE    20 Jan…       2
 5 Working Pa… UNDER … FALSE   "Afric…       0 "Mer… <df>    "Indep… 4.33e6 TRUE    20 Jan…       0
 6 Working Pa… UNDER … FALSE   ""           22 "Siz… <df>    "Unive… 4.33e6 TRUE    20 Jan…       2
 7 Accepted P… UNDER … FALSE   "Finan…       0 "Bud… <df>    "Norwe… 4.33e6 TRUE    20 Jan…       0
 8 Working Pa… UNDER … FALSE   "Journ…       6 "Fac… <df>    "Open … 4.33e6 TRUE    20 Jan…       2
 9 Working Pa… UNDER … FALSE   ""           34 "Soc… <df>    "Unive… 4.33e6 TRUE    20 Jan…       1
10 Working Pa… UNDER … FALSE   "Manag…       0 "Aud… <df>    "Chu H… 4.33e6 TRUE    20 Jan…       0
# … with 43,592 more rows, 1 more variable: url <chr>, and abbreviated variable names
#   ¹abstract_type, ²publication_status, ³reference, ⁴page_count, ⁵affiliations, ⁶is_approved,
#   ⁷approved_date, ⁸downloads

Keeping papers and authors in 2 separate tables:

library(jsonlite)
library(stringr)
library(dplyr)
library(tidyr)
library(purrr)

# Page size requested from the API on every call.
MAX_COUNT <- 200
# URL template; {start_idx} and {count} are filled in later by str_glue().
api_templ <- paste0(
  "https://api.ssrn.com/content/v1/bindings/204/papers",
  "?index={start_idx}&count={count}&sort=0"
)

# Get the first page; the response also carries the total number of papers.
resp <- read_json(
  str_glue(api_templ, start_idx = 0, count = MAX_COUNT),
  simplifyVector = TRUE
)
resp$total
#> [1] 43602

# overwrite to limit requests while testing
resp$total <- 600

# Fetch the remaining pages (offsets MAX_COUNT, 2 * MAX_COUNT, ...) and
# row-bind their `papers` tables.
# NOTE(review): when resp$total is a multiple of MAX_COUNT the sequence also
# contains an offset equal to resp$total, i.e. one request past the last
# counted paper -- confirm whether that extra page is intended.
papers <- map_df(
  seq(MAX_COUNT, resp$total, MAX_COUNT),
  ~ read_json(
    str_glue(api_templ, start_idx = .x, count = MAX_COUNT),
    simplifyVector = TRUE
  )$papers
)

# add papers from the first response
papers <- bind_rows(resp$papers, papers)

# Each paper carries its authors as a nested table. Expand it to one row per
# (paper, author) pair, keeping the paper id, and rename the author columns so
# they do not clash with the paper columns.
authors <- papers %>%
  select(id_paper = id, authors) %>%
  unnest(cols = authors) %>%
  rename(id_author = id, url_author = url)

# The papers table keeps everything except the nested authors column.
papers <- as_tibble(select(papers, -authors))

Result :

# Preview the first rows of the papers table.
papers %>% head()
#> # A tibble: 6 × 12
#>   abstrac…¹ publi…² is_paid refer…³ page_…⁴ title affil…⁵     id is_ap…⁶ appro…⁷
#>   <chr>     <chr>   <lgl>   <chr>     <int> <chr> <chr>    <int> <lgl>   <chr>  
#> 1 Working … UNDER … FALSE   ""           68 Is B… Concor… 4.33e6 TRUE    20 Jan…
#> 2 Working … UNDER … FALSE   ""           58 The … Univer… 4.33e6 TRUE    20 Jan…
#> 3 Working … UNDER … FALSE   ""            7 Appl… Atma J… 4.33e6 TRUE    20 Jan…
#> 4 Working … UNDER … FALSE   ""            7 The … Atmaja… 4.33e6 TRUE    20 Jan…
#> 5 Working … UNDER … FALSE   "Afric…       0 Merg… Indepe… 4.33e6 TRUE    20 Jan…
#> 6 Working … UNDER … FALSE   ""           22 Size… Univer… 4.33e6 TRUE    20 Jan…
#> # … with 2 more variables: downloads <int>, url <chr>, and abbreviated variable
#> #   names ¹abstract_type, ²publication_status, ³reference, ⁴page_count,
#> #   ⁵affiliations, ⁶is_approved, ⁷approved_date
# Preview the first rows of the authors table.
authors %>% head()
#> # A tibble: 6 × 5
#>   id_paper id_author last_name  first_name url_author                           
#>      <int>     <int> <chr>      <chr>      <chr>                                
#> 1  4330623    643676 Proelss    Juliane    https://papers.ssrn.com/sol3/cf_dev/…
#> 2  4330623    744422 Schweizer  Denis      https://papers.ssrn.com/sol3/cf_dev/…
#> 3  4330623   3518984 Sevigny    Stephane   https://papers.ssrn.com/sol3/cf_dev/…
#> 4  4330532   1530510 Cunningham Lauren M.  https://papers.ssrn.com/sol3/cf_dev/…
#> 5  4330532   1452555 Hayne      Christie   https://papers.ssrn.com/sol3/cf_dev/…
#> 6  4330532     51250 Neal       Terry L.   https://papers.ssrn.com/sol3/cf_dev/…

# If a single table is needed, join on the paper id (one row per paper-author pair):
papers %>%
  left_join(authors, by = c("id" = "id_paper"))
#> # A tibble: 1,903 × 16
#>    abstra…¹ publi…² is_paid refer…³ page_…⁴ title affil…⁵     id is_ap…⁶ appro…⁷
#>    <chr>    <chr>   <lgl>   <chr>     <int> <chr> <chr>    <int> <lgl>   <chr>  
#>  1 Working… UNDER … FALSE   ""           68 Is B… Concor… 4.33e6 TRUE    20 Jan…
#>  2 Working… UNDER … FALSE   ""           68 Is B… Concor… 4.33e6 TRUE    20 Jan…
#>  3 Working… UNDER … FALSE   ""           68 Is B… Concor… 4.33e6 TRUE    20 Jan…
#>  4 Working… UNDER … FALSE   ""           58 The … Univer… 4.33e6 TRUE    20 Jan…
#>  5 Working… UNDER … FALSE   ""           58 The … Univer… 4.33e6 TRUE    20 Jan…
#>  6 Working… UNDER … FALSE   ""           58 The … Univer… 4.33e6 TRUE    20 Jan…
#>  7 Working… UNDER … FALSE   ""           58 The … Univer… 4.33e6 TRUE    20 Jan…
#>  8 Working… UNDER … FALSE   ""            7 Appl… Atma J… 4.33e6 TRUE    20 Jan…
#>  9 Working… UNDER … FALSE   ""            7 The … Atmaja… 4.33e6 TRUE    20 Jan…
#> 10 Working… UNDER … FALSE   "Afric…       0 Merg… Indepe… 4.33e6 TRUE    20 Jan…
#> # … with 1,893 more rows, 6 more variables: downloads <int>, url <chr>,
#> #   id_author <int>, last_name <chr>, first_name <chr>, url_author <chr>, and
#> #   abbreviated variable names ¹abstract_type, ²publication_status, ³reference,
#> #   ⁴page_count, ⁵affiliations, ⁶is_approved, ⁷approved_date

Created on 2023-01-22 with reprex v2.0.2

Please sign up or log in to give your own answer.

Click here to cancel reply.