python调用semantic scholar【语义学者】API获取论文信息

CSDN 2024-09-16 16:35:01 阅读 57

原本是想抽取arxiv上面论文中的参考文献信息,但是PDF文件难以解析,故想到用该论文的信息去其他数据库中检索。semantic scholar上面的论文可以显示出文章的参考文献信息,故调用API实现此目的。总体的流程就是:

1. 根据arxiv-id获取semantic scholar - id
2. 通过semantic scholar - id获取该文章的参考文献信息(title、author、time、id)

数据准备

之前已经爬取好了arxiv上关于GCN的论文元数据,见文章👉python爬取arXiv论文元数据

爬取的论文元数据

例如论文链接 https://arxiv.org/pdf/2403.02221 的最后一串数字 2403.02221 就是该篇文章的arxiv-id。

根据文章arxiv-id获取semantic scholar - id

import requests

# Look up the Semantic Scholar paper ID that corresponds to an arXiv ID.

# arXiv ID of the paper to look up
arxiv_id = "2403.00825"

# Build the request URL, using the arXiv ID as the lookup key.
# NOTE(review): "/v1" appears to be the legacy API path — confirm against the
# current Semantic Scholar API documentation before relying on it.
url = f"https://api.semanticscholar.org/v1/paper/arXiv:{arxiv_id}"

# Send the request; the timeout keeps the script from hanging forever on a
# stalled connection.
response = requests.get(url, timeout=10)

# Check whether the request succeeded
if response.status_code == 200:
    # Parse the JSON body of the response
    data = response.json()

    # Extract and print the Semantic Scholar ID
    semantic_scholar_id = data.get("paperId")
    print(f"Semantic Scholar ID: {semantic_scholar_id}")
else:
    print(f"请求失败,状态码:{response.status_code}")

运行代码后输出:Semantic Scholar ID: f15a2d6878429c395e31d738a481fb39e98ca7e2

通过semantic scholar - id 获取该篇文章的参考文献信息(标题、作者、年份和ID)

import requests

# Print title, authors, year and Semantic Scholar ID for every reference of a paper.

semantic_scholar_id = "075f320d8e82673b51204a768d831a17f9999c02"

# Build the request URL
url = f"https://api.semanticscholar.org/v1/paper/{semantic_scholar_id}"

# Send the request; the timeout keeps the script from hanging forever on a
# stalled connection.
response = requests.get(url, timeout=10)

# Check whether the request succeeded
if response.status_code == 200:
    data = response.json()

    # A missing, null or empty "references" entry all mean "nothing to list"
    references = data.get("references")
    if references:
        # Print each reference's title, authors, year and Semantic Scholar ID
        for reference in references:
            print(f"Title: {reference.get('title', 'No title available')}")

            # Print the reference's authors, if the field is present
            if "authors" in reference:
                authors = ", ".join(
                    author.get("name", "N/A") for author in reference["authors"]
                )
                print(f"Authors: {authors}")

            # Print the publication year
            print(f"Year: {reference.get('year', 'No year available')}")

            # Print the Semantic Scholar ID
            print(f"Semantic Scholar ID: {reference.get('paperId', 'No ID available')}")
            print("-----")
    else:
        print("暂没有References")
else:
    print(f"请求失败,状态码:{response.status_code}")

输出:

Title: MSNet: Multi-Resolution Synergistic Networks for Adaptive Inference

Authors: Renlong Hang, Xuwei Qian, Qingshan Liu

Year: 2023

Semantic Scholar ID: 46a0dfaa98118728052b9f017940470ba79ce0f1

-----

Title: ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders

Authors: Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In-So Kweon, Saining Xie

Year: 2023

Semantic Scholar ID: 2218f1713d7f721ab76801063416ec9b11c7646f

-----

Title: Dynamic Neural Networks: A Survey

Authors: Yizeng Han, Gao Huang, Shiji Song, Le Yang, Honghui Wang, Yulin Wang

Year: 2021

Semantic Scholar ID: 837ac4ed6825502f0460caec45e12e734c85b113

-----

# 可以列出所有的参考文献,篇幅原因我仅列了3个

另外还有其他的一些功能,下面举几个例子;想了解更多信息,请参考官方文档👉semantic scholar - api官方文档链接

通过semantic scholar - id 获取该篇文章的信息

import requests

# Fetch and print the full record of a paper, looked up by its Semantic Scholar ID.

paper_id = "f15a2d6878429c395e31d738a481fb39e98ca7e2"

# Build the request URL
url = f"https://api.semanticscholar.org/v1/paper/{paper_id}"

# Send the request; the timeout keeps the script from hanging forever on a
# stalled connection.
response = requests.get(url, timeout=10)

# Check whether the request succeeded
if response.status_code == 200:
    paper_info = response.json()
    print(paper_info)  # print the paper record
else:
    print(f"请求失败,状态码:{response.status_code}")

输出:

{ 'abstract': "Text classification is the task of assigning a document to a predefined class. However, it is expensive to acquire enough labeled documents or to label them. In this paper, we study the regularization methods' effects on various classification models when only a few labeled data are available. We compare a simple word embedding-based model, which is simple but effective, with complex models (CNN and BiLSTM). In supervised learning, adversarial training can further regularize the model. When an unlabeled dataset is available, we can regularize the model using semi-supervised learning methods such as the Pi model and virtual adversarial training. We evaluate the regularization effects on four text classification datasets (AG news, DBpedia, Yahoo! Answers, Yelp Polarity), using only 0.1% to 0.5% of the original labeled training documents. The simple model performs relatively well in fully supervised learning, but with the help of adversarial training and semi-supervised learning, both simple and complex models can be regularized, showing better results for complex models. Although the simple model is robust to overfitting, a complex model with well-designed prior beliefs can be also robust to overfitting.",

'arxivId': '2403.00825',

'authors': [{ 'authorId': '2156939179',

'name': 'Jongga Lee',

'url': 'https://www.semanticscholar.org/author/2156939179'},

{ 'authorId': '2289841708',

'name': 'Jaeseung Yim',

'url': 'https://www.semanticscholar.org/author/2289841708'},

{ 'authorId': '2289841978',

'name': 'Seohee Park',

'url': 'https://www.semanticscholar.org/author/2289841978'},

{ 'authorId': '2290016625',

'name': 'Changwon Lim',

'url': 'https://www.semanticscholar.org/author/2290016625'}],

'citationVelocity': 0,

'citations': [],

'corpusId': 268230995,

'doi': '10.48550/arXiv.2403.00825',

'fieldsOfStudy': ['Computer Science'],

'influentialCitationCount': 0,

'isOpenAccess': False,

'isPublisherLicensed': True,

'is_open_access': False,

'is_publisher_licensed': True,

'numCitedBy': 0,

'numCiting': 0,

'paperId': 'f15a2d6878429c395e31d738a481fb39e98ca7e2',

'references': [],

's2FieldsOfStudy': [{ 'category': 'Computer Science', 'source': 'external'},

{ 'category': 'Computer Science', 'source': 's2-fos-model'}],

'title': 'Comparing effectiveness of regularization methods on text classification: Simple and complex model in data shortage situation',

'topics': [],

'url': 'https://www.semanticscholar.org/paper/f15a2d6878429c395e31d738a481fb39e98ca7e2',

'venue': 'arXiv.org',

'year': 2024}

获取该篇论文的10篇推荐论文

import requests

import json

# Fetch 10 recommended papers for a given paper and print them as JSON.

# Base URL of the recommendations API
base_url = "https://api.semanticscholar.org/recommendations/v1"

# API path and query parameters for the recommendation request
paper_id = "075f320d8e82673b51204a768d831a17f9999c02"
path = f"/papers/forpaper/{paper_id}"
params = {
    "limit": 10,  # number of recommended papers requested
    "fields": "title,authors,year",  # fields requested for each paper
}

# Send the GET request; the timeout keeps the script from hanging forever on a
# stalled connection.
response = requests.get(f"{base_url}{path}", params=params, timeout=10)

# Check whether the request succeeded
if response.status_code == 200:
    # Parse the response body and pretty-print it
    recommendations = response.json()
    print(json.dumps(recommendations, indent=2))
else:
    print(f"Error: {response.status_code}")

输出:

{

"recommendedPapers": [

{

"paperId": "bd8ee79c28ef2eb55185c6912484847696c0773b",

"title": "SoD2: Statically Optimizing Dynamic Deep Neural Network",

"year": 2024,

"authors": [

{

"authorId": "48643324",

"name": "Wei Niu"

},

{

"authorId": "2289611051",

"name": "Gagan Agrawal"

},

{

"authorId": "2244768705",

"name": "Bin Ren"

}

]

},

{ ..............省略

]

}

通过doi获取论文信息

import requests

# Fetch a paper by DOI and print its ID, title, authors, year and abstract.

doi = "10.1145/3292500.3330925"

# Build the request URL
url = f"https://api.semanticscholar.org/v1/paper/{doi}"

# Send the request; the timeout keeps the script from hanging forever on a
# stalled connection.
response = requests.get(url, timeout=10)

if response.status_code == 200:
    paper_details = response.json()
    print(f"Semantic Scholar ID: {paper_details.get('paperId')}")
    print(f"Title: {paper_details.get('title')}")
    print(f"Authors: {[author['name'] for author in paper_details.get('authors', [])]}")
    print(f"Year of Publication: {paper_details.get('year')}")
    print(f"Abstract: {paper_details.get('abstract', 'No abstract available')}")
else:
    print(f"Error: Failed to retrieve data, status code {response.status_code}")

输出:

Semantic Scholar ID: 05c4eb154ad9512a69569c18d68bc4428ee8bb83

Title: Cluster-GCN: An Efficient Algorithm for Training Deep and Large Graph Convolutional Networks

Authors: ['Wei-Lin Chiang', 'Xuanqing Liu', 'Si Si', 'Yang Li', 'Samy Bengio', 'Cho-Jui Hsieh']

Year of Publication: 2019

Abstract: Graph convolutional network (GCN) has been successfully applied to many graph-based applications; however, training a large-scale GCN remains challenging. Current SGD-based algorit....省略

邮箱:k1933211129@163.com



声明

本文内容仅代表作者观点,或转载于其他网站,本站不以此文作为商业用途
如有涉及侵权,请联系本站进行删除
转载本站原创文章,请注明来源及作者。