skip to Main Content

I am trying to scrape the first 200 entries from https://www.ssrn.com/index.cfm/en/arn/?page=1&sort=0 (title, authors, url, …). So far I have used rvest (which worked fine, looping over the first 4 pages, until this week), and I am now trying to scrape the JSON directly from https://api.ssrn.com/content/v1/bindings/204/papers. The code works fine (see below), but I don't know how to get more than the first 50 entries, or even display more than 50 entries (out of 43602). Is there a solution using jsonlite or rvest?

Any help appreciated! Thanks in advance.

library(jsonlite)
# Papers endpoint, requested here without any query parameters.
json_file <- "https://api.ssrn.com/content/v1/bindings/204/papers"
# Download the URL and parse the JSON response into R objects.
data <- fromJSON(json_file)
# Coerce the parsed response to a data frame.
data <- as.data.frame(data)

2

Answers


  1. If you look at the link, you can alter the `index` and `count` query parameters. The maximum `count` is 200 per request, so map over a sequence of `index` values to get all 43602 entries, like so (2-3 min scraping time):

    library(tidyverse) 
    library(httr2)
    
    # Fetch one page of papers from the SSRN bindings API.
    #
    # index: value sent as the `index` query parameter (start of the page).
    # count: value sent as the `count` query parameter; defaults to 200,
    #        the page size the original call hard-coded.
    # Returns a tibble built from the `papers` element of the JSON response.
    get_ssrn <- function(index, count = 200) {
      # "\n" terminates the progress line so successive calls print on
      # separate lines.
      cat("Scraping index:", index, "\n")
      # format(..., scientific = FALSE) keeps large round numbers such as 1e5
      # from being pasted into the URL as "1e+05".
      str_c(
        "https://api.ssrn.com/content/v1/bindings/204/papers?index=",
        format(index, scientific = FALSE, trim = TRUE),
        "&count=",
        format(count, scientific = FALSE, trim = TRUE),
        "&sort=0"
      ) %>%
        request() %>%
        req_perform() %>%
        resp_body_json(simplifyVector = TRUE) %>%
        pluck("papers") %>%
        as_tibble()
    }
    
    # Request pages starting at 0, 200, 400, ... up to the hard-coded total of
    # 43602 and row-bind the per-page tibbles into one data frame.
    df <- seq(from = 0, to = 43602, by = 200) %>%
      map_dfr(get_ssrn)
    
    df
    
    # A tibble: 43,602 × 13
       abstract_…¹ publi…² is_paid refer…³ page_…⁴ title authors affil…⁵     id is_ap…⁶ appro…⁷ downl…⁸
       <chr>       <chr>   <lgl>   <chr>     <int> <chr> <list>  <chr>    <int> <lgl>   <chr>     <int>
     1 Working Pa… UNDER … FALSE   ""           68 "Is … <df>    "Conco… 4.33e6 TRUE    20 Jan…      27
     2 Working Pa… UNDER … FALSE   ""           58 "The… <df>    "Unive… 4.33e6 TRUE    20 Jan…      14
     3 Working Pa… UNDER … FALSE   ""            7 "App… <df>    "Atma … 4.33e6 TRUE    20 Jan…       2
     4 Working Pa… UNDER … FALSE   ""            7 "The… <df>    "Atmaj… 4.33e6 TRUE    20 Jan…       2
     5 Working Pa… UNDER … FALSE   "Afric…       0 "Mer… <df>    "Indep… 4.33e6 TRUE    20 Jan…       0
     6 Working Pa… UNDER … FALSE   ""           22 "Siz… <df>    "Unive… 4.33e6 TRUE    20 Jan…       2
     7 Accepted P… UNDER … FALSE   "Finan…       0 "Bud… <df>    "Norwe… 4.33e6 TRUE    20 Jan…       0
     8 Working Pa… UNDER … FALSE   "Journ…       6 "Fac… <df>    "Open … 4.33e6 TRUE    20 Jan…       2
     9 Working Pa… UNDER … FALSE   ""           34 "Soc… <df>    "Unive… 4.33e6 TRUE    20 Jan…       1
    10 Working Pa… UNDER … FALSE   "Manag…       0 "Aud… <df>    "Chu H… 4.33e6 TRUE    20 Jan…       0
    # … with 43,592 more rows, 1 more variable: url <chr>, and abbreviated variable names
    #   ¹​abstract_type, ²​publication_status, ³​reference, ⁴​page_count, ⁵​affiliations, ⁶​is_approved,
    #   ⁷​approved_date, ⁸​downloads
    
    Login or Signup to reply.
  2. Keeping papers and authors in 2 separate tables:

    library(jsonlite)
    library(stringr)
    library(dplyr)
    library(tidyr)
    library(purrr)
    
    # Page size sent as the `count` query parameter on every request.
    MAX_COUNT <- 200
    api_templ <- "https://api.ssrn.com/content/v1/bindings/204/papers?index={start_idx}&count={count}&sort=0"
    
    # get the first set and total number of papers
    resp <- read_json(
      str_glue(api_templ, start_idx = 0, count = MAX_COUNT),
      simplifyVector = TRUE
    )
    resp$total
    #> [1] 43602
    
    # overwrite to limit requests while testing
    resp$total <- 600
    
    # Fetch the remaining pages, starting at index MAX_COUNT because index 0 is
    # already in `resp`.
    # NOTE: seq() includes its endpoint, so when `resp$total` is an exact
    # multiple of MAX_COUNT one extra request is made at index == total; seq()
    # also errors if `resp$total` is smaller than MAX_COUNT.
    papers <- map_df(
      seq(MAX_COUNT, resp$total, by = MAX_COUNT),
      ~ read_json(
        str_glue(api_templ, start_idx = .x, count = MAX_COUNT),
        simplifyVector = TRUE
      )$papers
    )
    
    # add papers from the first response 
    papers <- bind_rows(resp$papers, papers)
    
    # authors are in nested tables, unnest while keeping papers id
    authors <- papers %>%
      select(id_paper = id, authors) %>%
      unnest(authors) %>%
      rename(id_author = id, url_author = url)
    
    # Drop the nested authors column now that it lives in its own table.
    papers <- papers %>%
      select(-authors) %>%
      as_tibble()
    

    Result :

    head(papers)
    #> # A tibble: 6 × 12
    #>   abstrac…¹ publi…² is_paid refer…³ page_…⁴ title affil…⁵     id is_ap…⁶ appro…⁷
    #>   <chr>     <chr>   <lgl>   <chr>     <int> <chr> <chr>    <int> <lgl>   <chr>  
    #> 1 Working … UNDER … FALSE   ""           68 Is B… Concor… 4.33e6 TRUE    20 Jan…
    #> 2 Working … UNDER … FALSE   ""           58 The … Univer… 4.33e6 TRUE    20 Jan…
    #> 3 Working … UNDER … FALSE   ""            7 Appl… Atma J… 4.33e6 TRUE    20 Jan…
    #> 4 Working … UNDER … FALSE   ""            7 The … Atmaja… 4.33e6 TRUE    20 Jan…
    #> 5 Working … UNDER … FALSE   "Afric…       0 Merg… Indepe… 4.33e6 TRUE    20 Jan…
    #> 6 Working … UNDER … FALSE   ""           22 Size… Univer… 4.33e6 TRUE    20 Jan…
    #> # … with 2 more variables: downloads <int>, url <chr>, and abbreviated variable
    #> #   names ¹​abstract_type, ²​publication_status, ³​reference, ⁴​page_count,
    #> #   ⁵​affiliations, ⁶​is_approved, ⁷​approved_date
    head(authors)
    #> # A tibble: 6 × 5
    #>   id_paper id_author last_name  first_name url_author                           
    #>      <int>     <int> <chr>      <chr>      <chr>                                
    #> 1  4330623    643676 Proelss    Juliane    https://papers.ssrn.com/sol3/cf_dev/…
    #> 2  4330623    744422 Schweizer  Denis      https://papers.ssrn.com/sol3/cf_dev/…
    #> 3  4330623   3518984 Sevigny    Stephane   https://papers.ssrn.com/sol3/cf_dev/…
    #> 4  4330532   1530510 Cunningham Lauren M.  https://papers.ssrn.com/sol3/cf_dev/…
    #> 5  4330532   1452555 Hayne      Christie   https://papers.ssrn.com/sol3/cf_dev/…
    #> 6  4330532     51250 Neal       Terry L.   https://papers.ssrn.com/sol3/cf_dev/…
    
    # join tables if needed:
    left_join(papers, authors, by = c("id" = "id_paper"))
    #> # A tibble: 1,903 × 16
    #>    abstra…¹ publi…² is_paid refer…³ page_…⁴ title affil…⁵     id is_ap…⁶ appro…⁷
    #>    <chr>    <chr>   <lgl>   <chr>     <int> <chr> <chr>    <int> <lgl>   <chr>  
    #>  1 Working… UNDER … FALSE   ""           68 Is B… Concor… 4.33e6 TRUE    20 Jan…
    #>  2 Working… UNDER … FALSE   ""           68 Is B… Concor… 4.33e6 TRUE    20 Jan…
    #>  3 Working… UNDER … FALSE   ""           68 Is B… Concor… 4.33e6 TRUE    20 Jan…
    #>  4 Working… UNDER … FALSE   ""           58 The … Univer… 4.33e6 TRUE    20 Jan…
    #>  5 Working… UNDER … FALSE   ""           58 The … Univer… 4.33e6 TRUE    20 Jan…
    #>  6 Working… UNDER … FALSE   ""           58 The … Univer… 4.33e6 TRUE    20 Jan…
    #>  7 Working… UNDER … FALSE   ""           58 The … Univer… 4.33e6 TRUE    20 Jan…
    #>  8 Working… UNDER … FALSE   ""            7 Appl… Atma J… 4.33e6 TRUE    20 Jan…
    #>  9 Working… UNDER … FALSE   ""            7 The … Atmaja… 4.33e6 TRUE    20 Jan…
    #> 10 Working… UNDER … FALSE   "Afric…       0 Merg… Indepe… 4.33e6 TRUE    20 Jan…
    #> # … with 1,893 more rows, 6 more variables: downloads <int>, url <chr>,
    #> #   id_author <int>, last_name <chr>, first_name <chr>, url_author <chr>, and
    #> #   abbreviated variable names ¹​abstract_type, ²​publication_status, ³​reference,
    #> #   ⁴​page_count, ⁵​affiliations, ⁶​is_approved, ⁷​approved_date
    

    Created on 2023-01-22 with reprex v2.0.2

    Login or Signup to reply.
Please signup or login to give your own answer.
Back To Top
Search