| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | import re |
| | from typing import Any, Dict, List, Optional |
| |
|
| | from camel.toolkits import FunctionTool |
| | from camel.toolkits.base import BaseToolkit |
| |
|
| |
|
class GoogleScholarToolkit(BaseToolkit):
    r"""A toolkit for retrieving information about authors and their
    publications from Google Scholar.

    Attributes:
        author_identifier (str): The author's Google Scholar URL or name of
            the author to search for.
        is_author_name (bool): Flag to indicate if the identifier is a name.
            (default: :obj:`False`)
        scholarly (object): The ``scholarly`` object imported from the
            scholarly package, used for querying Google Scholar.
        author (Dict[str, Any]): Cached author details, fetched lazily on
            first access; may also be assigned manually.
    """

    def __init__(
        self,
        author_identifier: str,
        is_author_name: bool = False,
        use_free_proxies: bool = False,
        proxy_http: Optional[str] = None,
        proxy_https: Optional[str] = None,
    ) -> None:
        r"""Initializes the GoogleScholarToolkit with the author's identifier.

        Args:
            author_identifier (str): The author's Google Scholar URL or name
                of the author to search for.
            is_author_name (bool): Flag to indicate if the identifier is a
                name. (default: :obj:`False`)
            use_free_proxies (bool): Whether to use Free Proxies.
                (default: :obj:`False`)
            proxy_http (Optional[str]): HTTP proxy address passed to
                `ProxyGenerator.SingleProxy`. (default: :obj:`None`)
            proxy_https (Optional[str]): HTTPS proxy address passed to
                `ProxyGenerator.SingleProxy`. (default: :obj:`None`)
        """
        # Imported lazily so the dependency is only needed when the toolkit
        # is actually instantiated.
        from scholarly import ProxyGenerator, scholarly

        # Optionally route requests through free proxies.
        if use_free_proxies:
            pg = ProxyGenerator()
            pg.FreeProxies()
            scholarly.use_proxy(pg)

        # Optionally route requests through an explicitly configured proxy.
        # When both options are given, this proxy is registered last.
        if proxy_http or proxy_https:
            pg = ProxyGenerator()
            pg.SingleProxy(http=proxy_http, https=proxy_https)
            scholarly.use_proxy(pg)

        self.scholarly = scholarly
        self.author_identifier = author_identifier
        self.is_author_name = is_author_name
        # Filled on first access of `author` or by an explicit call to
        # `get_author_detailed_info`.
        self._author: Optional[Dict[str, Any]] = None

    @property
    def author(self) -> Dict[str, Any]:
        r"""Getter for the author attribute, fetching details if not cached.

        Returns:
            Dict[str, Any]: A dictionary containing author details. If the
                fetch yields no data, returns an empty dictionary.

        Raises:
            ValueError: If the details have to be fetched and the author
                cannot be resolved from `author_identifier`.
        """
        if self._author is None:
            self.get_author_detailed_info()
        return self._author or {}

    @author.setter
    def author(self, value: Optional[Dict[str, Any]]) -> None:
        r"""Sets or overrides the cached author information.

        Args:
            value (Optional[Dict[str, Any]]): A dictionary containing author
                details to cache or `None` to clear the cached data.

        Raises:
            ValueError: If `value` is not a dictionary or `None`.
        """
        if value is None or isinstance(value, dict):
            self._author = value
        else:
            raise ValueError("Author must be a dictionary or None.")

    def _extract_author_id(self) -> Optional[str]:
        r"""Extracts the author ID from a Google Scholar URL if provided.

        Returns:
            Optional[str]: The extracted author ID, or None if not found.
        """
        # Scholar IDs may contain underscores as well as letters, digits and
        # hyphens; leaving '_' out of the class would truncate such IDs.
        match = re.search(r'user=([A-Za-z0-9_-]+)', self.author_identifier)
        return match.group(1) if match else None

    def get_author_detailed_info(
        self,
    ) -> dict:
        r"""Retrieves detailed information about the author.

        The result is also stored in the toolkit's author cache.

        Returns:
            dict: A dictionary containing detailed information about the
                author.

        Raises:
            ValueError: If no author matches the given name, or if no author
                ID can be extracted from the given URL.
        """
        if self.is_author_name:
            search_query = self.scholarly.search_author(self.author_identifier)
            # Use a default so an empty result set does not leak a bare
            # StopIteration to the caller.
            first_author_result = next(search_query, None)
            if first_author_result is None:
                raise ValueError(
                    "No Google Scholar author found for name "
                    f"'{self.author_identifier}'."
                )
        else:
            author_id = self._extract_author_id()
            if author_id is None:
                # Fail early instead of querying Google Scholar with id=None.
                raise ValueError(
                    "Could not extract an author ID from "
                    f"'{self.author_identifier}'; expected a Google Scholar "
                    "URL containing a 'user=<id>' parameter."
                )
            first_author_result = self.scholarly.search_author_id(id=author_id)

        self._author = self.scholarly.fill(first_author_result)
        return self._author

    def get_author_publications(
        self,
    ) -> List[str]:
        r"""Retrieves the titles of the author's publications.

        Returns:
            List[str]: A list of publication titles authored by the author.
                Empty if the author data holds no publications.
        """
        # `author` may be an empty or manually assigned dict, so do not
        # assume the 'publications' key is present.
        publication_titles = [
            pub['bib']['title']
            for pub in self.author.get('publications', [])
        ]
        return publication_titles

    def get_publication_by_title(
        self, publication_title: str
    ) -> Optional[dict]:
        r"""Retrieves detailed information about a specific publication by its
        title. Note that this method cannot retrieve the full content of the
        paper.

        Args:
            publication_title (str): The title of the publication to search
                for.

        Returns:
            Optional[dict]: A dictionary containing detailed information about
                the publication if found; otherwise, `None`.
        """
        # `author` may be an empty or manually assigned dict, so do not
        # assume the 'publications' key is present.
        publications = self.author.get('publications', [])
        for publication in publications:
            # Titles are compared exactly (case- and whitespace-sensitive).
            if publication['bib']['title'] == publication_title:
                return self.scholarly.fill(publication)
        return None

    def get_full_paper_content_by_link(self, pdf_url: str) -> Optional[str]:
        r"""Retrieves the full paper content from a given PDF URL using the
        arxiv2text tool.

        Args:
            pdf_url (str): The URL of the PDF file.

        Returns:
            Optional[str]: The full text extracted from the PDF, or `None` if
                an error occurs.
        """
        from arxiv2text import arxiv_to_text

        try:
            return arxiv_to_text(pdf_url)
        except Exception:
            # Deliberately best-effort: any extraction failure is reported
            # to the caller as `None`, as documented above.
            return None

    def get_tools(self) -> List[FunctionTool]:
        r"""Returns a list of FunctionTool objects representing the
        functions in the toolkit.

        Returns:
            List[FunctionTool]: A list of FunctionTool objects
                representing the functions in the toolkit.
        """
        return [
            FunctionTool(self.get_author_detailed_info),
            FunctionTool(self.get_author_publications),
            FunctionTool(self.get_publication_by_title),
            FunctionTool(self.get_full_paper_content_by_link),
        ]
| |
|