From 4158a6bea5e4c94ad9b5f3538fe621c4783be3a9 Mon Sep 17 00:00:00 2001 From: Sarah Oberbichler <66369271+soberbichler@users.noreply.github.com> Date: Tue, 24 Dec 2024 01:40:25 +0100 Subject: [PATCH] Created with Colab --- Named_Entity_Recognition_ImpressoAPI.ipynb | 464 +++++++++++++++++++++ 1 file changed, 464 insertions(+) create mode 100644 Named_Entity_Recognition_ImpressoAPI.ipynb diff --git a/Named_Entity_Recognition_ImpressoAPI.ipynb b/Named_Entity_Recognition_ImpressoAPI.ipynb new file mode 100644 index 0000000..265c55b --- /dev/null +++ b/Named_Entity_Recognition_ImpressoAPI.ipynb @@ -0,0 +1,464 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "collapsed_sections": [ + "Lr-phCiuy1aW", + "5m00DEeLz1bs" + ], + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Visualizing Named Entities with Impresso API\n", + "\n", + "The models and corresponding API implementation used in this notebook were developed by Emanuela Boros [![ORCID](data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIyMCIgaGVpZ2h0PSIyMCIgdmlld0JveD0iMCAwIDIwIDIwIj4KICA8cmVjdCB3aWR0aD0iMjAiIGhlaWdodD0iMjAiIGZpbGw9IiNGRkZGRkYiLz4KICA8Y2lyY2xlIGN4PSIxMCIgY3k9IjEwIiByPSI5IiBmaWxsPSIjQThDRTNDIi8+CiAgPHRleHQgeD0iMTAiIHk9IjE1IiBmb250LWZhbWlseT0iQXJpYWwsIHNhbnMtc2VyaWYiIGZvbnQtc2l6ZT0iMTEiIGZvbnQtd2VpZ2h0PSJib2xkIiB0ZXh0LWFuY2hvcj0ibWlkZGxlIiBmaWxsPSIjRkZGRkZGIj5pRDwvdGV4dD4KPC9zdmc+)](https://orcid.org/0000-0001-6299-9452) from the Impresso Project (https://impresso-project.ch/). The notebook was created by Sarah Oberbichler for teaching purposes [![ORCID](data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIyMCIgaGVpZ2h0PSIyMCIgdmlld0JveD0iMCAwIDIwIDIwIj4KICA8cmVjdCB3aWR0aD0iMjAiIGhlaWdodD0iMjAiIGZpbGw9IiNGRkZGRkYiLz4KICA8Y2lyY2xlIGN4PSIxMCIgY3k9IjEwIiByPSI5IiBmaWxsPSIjQThDRTNDIi8+CiAgPHRleHQgeD0iMTAiIHk9IjE1IiBmb250LWZhbWlseT0iQXJpYWwsIHNhbnMtc2VyaWYiIGZvbnQtc2l6ZT0iMTEiIGZvbnQtd2VpZ2h0PSJib2xkIiB0ZXh0LWFuY2hvcj0ibWlkZGxlIiBmaWxsPSIjRkZGRkZGIj5pRDwvdGV4dD4KPC9zdmc+)](https://orcid.org/0000-0002-1031-2759).\n", + "\n", + "## About this Notebook\n", + "\n", + "This notebook enables the extraction of named entities from any dataset using the Impresso NER and Named Entity Linking (NEL) API, providing precise recognition of fine-grained named entities in historical texts across German, French, and English. It also demonstrates how to visualize location entities by generating geographic coordinates for the extracted locations and plotting them on an interactive map, enhancing both analysis and presentation.\n", + "\n", + "**Acknowledgements:**\n", + "\n", + "For further experimentation, you can directly access the experimental API at [Impresso Annotation](https://impresso-annotation.epfl.ch/).\n" + ], + "metadata": { + "id": "CFDuZVrUhJR3" + } + }, + { + "cell_type": "markdown", + "source": [ + "## Call the Impresso NER and NEL API\n", + "We only use the NER part of the API in this notebook."
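, + "\n", + "\n", + "The API returns a JSON object whose `nes` list holds the recognized entities; each entry includes at least a `surface` string (the matched text) and a `type` label (e.g. `loc` for locations), which the cells below rely on."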
+ ], + "metadata": { + "id": "k8VurD5Hyy7g" + } + }, + { + "cell_type": "code", + "source": [ + "import requests\n", + "\n", + "def get_linked_entities(text, coarse_only=False):\n", + "    \"\"\"\n", + "    Calls the external API to get named entity recognition (NER) results.\n", + "    \"\"\"\n", + "    url = \"https://impresso-annotation.epfl.ch/api/ner/\"\n", + "    payload = {\"data\": text}\n", + "    try:\n", + "        response = requests.post(url, json=payload)\n", + "        if response.status_code == 200:\n", + "            data = response.json()\n", + "            data[\"text\"] = text\n", + "            # keep only coarse entity types (drop fine-grained subtypes and components)\n", + "            if coarse_only:\n", + "                data[\"nes\"] = [ne for ne in data[\"nes\"] if \".\" not in ne[\"type\"]]\n", + "            return data\n", + "        else:\n", + "            print(f\"Request failed with status code {response.status_code}\")\n", + "            return None\n", + "    except requests.exceptions.RequestException as e:\n", + "        print(f\"An error occurred: {e}\")\n", + "        return None\n" + ], + "metadata": { + "id": "Lhyczl3eywiV" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Importing a Dataset" + ], + "metadata": { + "id": "iUKvkcyPbwYK" + } + }, + { + "cell_type": "code", + "source": [ + "!git clone https://github.com/ieg-dhr/NLP-Course4Humanities_2024.git" + ], + "metadata": { + "id": "sAl9x2AAb0wv" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "\n", + "articles_df = pd.read_excel('/content/NLP-Course4Humanities_2024/datasets/earthquake_articles.xlsx')\n", + "\n", + "articles_df = articles_df[:20]  # keep only the first 20 articles for this demo\n", + "articles_df.head()" + ], + "metadata": { + "id": "Zc8inLMXcA4z" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "import requests\n", + "import pandas as pd\n", + "import json\n", + "\n", + "def get_linked_entities(text, coarse_only=False):\n", + "    \"\"\"\n", + "    Calls the external API to get named entity recognition (NER) results.\n", + "    \"\"\"\n", + "    url = \"https://impresso-annotation.epfl.ch/api/ner/\"\n", + "    payload = {\"data\": text}\n", + "    try:\n", + "        response = requests.post(url, json=payload)\n", + "        if response.status_code == 200:\n", + "            data = response.json()\n", + "            data[\"text\"] = text\n", + "            # keep only coarse entity types (drop fine-grained subtypes and components)\n", + "            if coarse_only:\n", + "                data[\"nes\"] = [ne for ne in data[\"nes\"] if \".\" not in ne[\"type\"]]\n", + "            return data\n", + "        else:\n", + "            print(f\"Request failed with status code {response.status_code}\")\n", + "            return None\n", + "    except requests.exceptions.RequestException as e:\n", + "        print(f\"An error occurred: {e}\")\n", + "        return None\n", + "\n", + "# Assuming 'extracted_article_clean' is the column name in your DataFrame\n", + "def process_dataframe(df):\n", + "    results = []\n", + "    for index, row in df.iterrows():\n", + "        text = row['extracted_article_clean']\n", + "        api_result = get_linked_entities(text, coarse_only=True)\n", + "        results.append(api_result)\n", + "    df['NER_results'] = results\n", + "    return df\n", + "\n", + "articles_df = process_dataframe(articles_df)\n", + "articles_df.head()" + ], + "metadata": { + "id": "UYW6HMFDdFB3" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# export as an Excel file\n", + "articles_df.to_excel('earthquake_articles_with_ner.xlsx', index=False)" + ], + "metadata": { + "id": "S2w08o3EdZs2" + }, + "execution_count": null, + "outputs": [] + },
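 + { + "cell_type": "markdown", + "source": [ + "Before extracting specific entity types, it is worth inspecting what the API actually returned. The next cell is a small sketch (assuming the cells above ran successfully and the `NER_results` column is populated): it prints the `surface` form and `type` label of the first few entities found in the first article." + ], + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "# Inspect the raw NER output for the first article (assumes the cells above ran)\n", + "example = articles_df['NER_results'].iloc[0]\n", + "if example and 'nes' in example:\n", + "    for ne in example['nes'][:10]:  # look at the first ten entities only\n", + "        print(ne.get('surface'), '->', ne.get('type'))\n", + "else:\n", + "    print('No NER results available for the first article')" + ], + "metadata": {}, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": 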
"markdown", + "source": [ + "##Extracting Locations from the NER Output - An Example\n", + "\n", + "We can extract specific entities in order to use them for further analysis or visualization. Please not that when using extracted named entities for further analysis, they need to be controlled and verified by a human reader since the model most likely has made some mistakes." + ], + "metadata": { + "id": "pqEqYPu_BLq6" + } + }, + { + "cell_type": "code", + "source": [ + "#extract locations\n", + "\n", + "import pandas as pd\n", + "\n", + "def extract_locations(df):\n", + " places = []\n", + " for index, row in df.iterrows():\n", + " ner_results = row['NER_results']\n", + " if ner_results and 'nes' in ner_results:\n", + " # Use 'surface' instead of 'text' to get the entity text\n", + " location_names = [ne['surface'] for ne in ner_results['nes'] if ne.get('type') == 'loc']\n", + " places.append(', '.join(location_names)) # Join multiple locations\n", + " else:\n", + " places.append('') # Handle cases where NER_results is None or nes key is missing\n", + " df['places'] = places\n", + " return df\n", + "\n", + "articles_df = extract_locations(articles_df)\n", + "print(articles_df[['extracted_article_clean', 'places']].head())" + ], + "metadata": { + "id": "0KBxbwfId_3E" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Visualization Example - Creating a Map with Place Names\n", + "\n", + "We first use the geopy library to process geographic locations and add their corresponding coordinates (latitude and longitude) to a pandas DataFrame. It includes a GeocodingService class that interfaces with the Nominatim geocoding API, implementing rate-limiting, retries with exponential backoff, and error handling to ensure robust geocoding.\n", + "\n", + "We further use the folium library to create an interactive map with markers for locations provided in a pandas DataFrame. Finally, the map is created and displayed, providing a visual representation of the geographic data." 
+ ], + "metadata": { + "id": "dGRBF5HtBG0R" + } + }, + { + "cell_type": "code", + "source": [ + "from geopy.geocoders import Nominatim\n", + "from geopy.exc import GeocoderTimedOut, GeocoderServiceError\n", + "import pandas as pd\n", + "import time\n", + "from typing import List, Tuple, Optional\n", + "import random\n", + "\n", + "class GeocodingService:\n", + " def __init__(self, user_agent: str = None, timeout: int = 10, rate_limit: float = 1.1):\n", + " \"\"\"\n", + " Initialize the geocoding service with proper configuration.\n", + "\n", + " Args:\n", + " user_agent: Custom user agent string (default: generated)\n", + " timeout: Timeout for requests in seconds\n", + " rate_limit: Time to wait between requests in seconds\n", + " \"\"\"\n", + " if user_agent is None:\n", + " user_agent = f\"python_geocoding_script_{random.randint(1000, 9999)}\"\n", + "\n", + " self.geolocator = Nominatim(\n", + " user_agent=user_agent,\n", + " timeout=timeout\n", + " )\n", + " self.rate_limit = rate_limit\n", + " self.last_request = 0\n", + "\n", + " def _rate_limit_wait(self):\n", + " \"\"\"Implement rate limiting between requests\"\"\"\n", + " current_time = time.time()\n", + " time_since_last = current_time - self.last_request\n", + " if time_since_last < self.rate_limit:\n", + " time.sleep(self.rate_limit - time_since_last)\n", + " self.last_request = time.time()\n", + "\n", + " def geocode_location(self, location: str, max_retries: int = 3) -> Optional[Tuple[float, float]]:\n", + " \"\"\"\n", + " Geocode a single location with retries.\n", + "\n", + " Args:\n", + " location: Location string to geocode\n", + " max_retries: Maximum number of retry attempts\n", + "\n", + " Returns:\n", + " Tuple of (latitude, longitude) or None if geocoding fails\n", + " \"\"\"\n", + " for attempt in range(max_retries):\n", + " try:\n", + " self._rate_limit_wait()\n", + " location_data = self.geolocator.geocode(location)\n", + " if location_data:\n", + " return (location_data.latitude, location_data.longitude)\n", + " return None\n", + " except (GeocoderTimedOut, GeocoderServiceError) as e:\n", + " if attempt == max_retries - 1:\n", + " print(f\"Failed to geocode '{location}' after {max_retries} attempts: {e}\")\n", + " return None\n", + " time.sleep(2 ** attempt) # Exponential backoff\n", + " except Exception as e:\n", + " print(f\"Error geocoding '{location}': {e}\")\n", + " return None\n", + " return None\n", + "\n", + " def process_locations(self, locations: str) -> List[Optional[Tuple[float, float]]]:\n", + " \"\"\"\n", + " Process a comma-separated string of locations.\n", + "\n", + " Args:\n", + " locations: Comma-separated string of location names\n", + "\n", + " Returns:\n", + " List of coordinate tuples or None for failed geocoding\n", + " \"\"\"\n", + " if pd.isna(locations) or not locations:\n", + " return []\n", + "\n", + " location_list = [loc.strip() for loc in locations.split(',')]\n", + " return [self.geocode_location(loc) for loc in location_list]\n", + "\n", + "def geolocate_places(df: pd.DataFrame,\n", + " places_column: str = 'places',\n", + " user_agent: str = None) -> pd.DataFrame:\n", + " \"\"\"\n", + " Add coordinates to a DataFrame based on location names.\n", + "\n", + " Args:\n", + " df: Input DataFrame\n", + " places_column: Name of the column containing comma-separated location strings\n", + " user_agent: Custom user agent string\n", + "\n", + " Returns:\n", + " DataFrame with added 'coordinates' column\n", + " \"\"\"\n", + " geocoder = GeocodingService(user_agent=user_agent)\n", + "\n", + 
" # Create a copy to avoid modifying the original DataFrame\n", + " result_df = df.copy()\n", + "\n", + " # Process locations\n", + " result_df['coordinates'] = result_df[places_column].apply(geocoder.process_locations)\n", + "\n", + " return result_df\n", + "\n", + "# Main execution\n", + "if __name__ == \"__main__\":\n", + " # Assuming articles_df is your DataFrame with a 'places' column\n", + " # Apply geocoding to the articles DataFrame\n", + " articles_df_with_coords = geolocate_places(\n", + " articles_df,\n", + " places_column='places',\n", + " user_agent='article_geocoding_service_v1.0'\n", + " )\n", + "\n", + " # Update the original DataFrame with the new coordinates\n", + " articles_df['coordinates'] = articles_df_with_coords['coordinates']\n", + "\n", + " # Display the results\n", + " print(\"\\nSample of geocoded locations:\")\n", + " print(articles_df[['places', 'coordinates']].head())\n", + "\n", + " # Optional: Display some statistics\n", + " total_locations = len(articles_df)\n", + " successful_geocodes = articles_df['coordinates'].apply(lambda x: len([c for c in x if c is not None])).sum()\n", + " failed_geocodes = articles_df['coordinates'].apply(lambda x: len([c for c in x if c is None])).sum()\n", + "\n", + " print(f\"\\nGeocoding Statistics:\")\n", + " print(f\"Total locations processed: {total_locations}\")\n", + " print(f\"Successfully geocoded: {successful_geocodes}\")\n", + " print(f\"Failed to geocode: {failed_geocodes}\")" + ], + "metadata": { + "id": "_E_H2q1HevoY" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "import folium\n", + "from folium import plugins\n", + "import pandas as pd\n", + "from typing import List, Tuple, Optional\n", + "from IPython.display import display\n", + "\n", + "def create_location_map(df: pd.DataFrame,\n", + " coordinates_col: str = 'coordinates',\n", + " places_col: str = 'places',\n", + " title_col: Optional[str] = None) -> folium.Map:\n", + " \"\"\"\n", + " Create an interactive map with markers for all locations in the DataFrame.\n", + "\n", + " Args:\n", + " df: DataFrame containing coordinates and place names\n", + " coordinates_col: Name of column containing coordinates\n", + " places_col: Name of column containing place names\n", + " title_col: Optional column name for additional marker information\n", + "\n", + " Returns:\n", + " folium.Map object with all locations marked\n", + " \"\"\"\n", + " # Initialize the map\n", + " m = folium.Map(location=[0, 0], zoom_start=2)\n", + "\n", + " # Create a MarkerCluster\n", + " marker_cluster = plugins.MarkerCluster().add_to(m)\n", + "\n", + " # Keep track of all valid coordinates for setting bounds\n", + " all_coords = []\n", + "\n", + " # Process each row in the DataFrame\n", + " for idx, row in df.iterrows():\n", + " coordinates = row[coordinates_col]\n", + " places = row[places_col].split(',') if pd.notna(row[places_col]) else []\n", + " title = row[title_col] if title_col and pd.notna(row[title_col]) else None\n", + "\n", + " # Skip if no coordinates\n", + " if not coordinates:\n", + " continue\n", + "\n", + " # Add markers for each location\n", + " for i, (coord, place) in enumerate(zip(coordinates, places)):\n", + " if coord is not None: # Skip None coordinates\n", + " lat, lon = coord\n", + " place_name = place.strip()\n", + "\n", + " # Create popup content\n", + " popup_content = f\"{place_name}\"\n", + " if title:\n", + " popup_content += f\"
{title}\"\n", + "\n", + " # Add marker\n", + " folium.Marker(\n", + " location=[lat, lon],\n", + " popup=popup_content,\n", + " tooltip=place_name\n", + " ).add_to(marker_cluster)\n", + "\n", + " all_coords.append([lat, lon])\n", + "\n", + " # If we have coordinates, fit the map bounds to include all points\n", + " if all_coords:\n", + " m.fit_bounds(all_coords)\n", + "\n", + " return m\n", + "\n", + "# Create and display the map\n", + "map_obj = create_location_map(articles_df)\n", + "display(map_obj)" + ], + "metadata": { + "id": "Lp0cBMtAhmEF" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file