Skip to content
Closed
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions CITATION.cff
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ authors:
repository-code: 'https://github.com/fastdatascience/drug_named_entity_recognition'
url: 'https://fastdatascience.com/drug-named-entity-recognition-python-library/'
license: MIT
version: 2.0.9
version: 2.0.11
date-released: '2024-10-04'
url: 'https://zenodo.org/doi/10.5281/zenodo.10970631'
doi: 10.5281/zenodo.10970631
doi: 10.5281/zenodo.10970631
16 changes: 14 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,18 @@ To the extent possible under law, the person who associated CC0 with the DrugBan

If you'd like to contribute to this project, you can contact us at https://fastdatascience.com/ or make a pull request on our [GitHub repository](https://github.com/fastdatascience/drug_named_entity_recognition). You can also [raise an issue](https://github.com/fastdatascience/drug_named_entity_recognition/issues).

## Future Improvements

### Data Storage Format Migration

Currently, the drug dictionary data and FuzzySet data structures are stored using Python's `pickle` format. Future work includes:

- **Migrate drug dictionary storage from pickle to JSON**: The drug dictionary data (`drug_variant_to_canonical`, `drug_canonical_to_data`, `drug_variant_to_variant_data`) should be stored in a standard JSON format instead of pickle for better portability, version-control compatibility, and security (loading a pickle file from an untrusted source can execute arbitrary code).

- **Add JSON serialization support for FuzzySet**: The FuzzySet data structures (used for fuzzy matching) should be serializable to JSON format. This would allow pre-building FuzzySets during data preparation (`harvesting_data_from_source/07_combine_data_sources.py`) and loading them directly in `drugs_finder.py`, eliminating the need to rebuild them on every import and improving startup performance.

These improvements would make the data format more transparent, easier to inspect, and compatible with a wider range of tools and workflows.

## Developing the Drug Named Entity Recognition library

### Automated tests
Expand Down Expand Up @@ -405,12 +417,12 @@ MIT License. Copyright (c) 2023 [Fast Data Science](https://fastdatascience.com)

## ✍️ Citing the Drug Named Entity Recognition library

Wood, T.A., Drug Named Entity Recognition [Computer software], Version 2.0.9, accessed at [https://fastdatascience.com/drug-named-entity-recognition-python-library](https://fastdatascience.com/drug-named-entity-recognition-python-library), Fast Data Science Ltd (2024)
Wood, T.A., Drug Named Entity Recognition [Computer software], Version 2.0.11, accessed at [https://fastdatascience.com/drug-named-entity-recognition-python-library](https://fastdatascience.com/drug-named-entity-recognition-python-library), Fast Data Science Ltd (2024)

```
@unpublished{drugnamedentityrecognition,
AUTHOR = {Wood, T.A.},
TITLE = {Drug Named Entity Recognition (Computer software), Version 2.0.9},
TITLE = {Drug Named Entity Recognition (Computer software), Version 2.0.11},
YEAR = {2024},
Note = {To appear},
url = {https://zenodo.org/doi/10.5281/zenodo.10970631},
Expand Down
26 changes: 10 additions & 16 deletions harvesting_data_from_source/01_drugbank_download_vocabulary.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,6 @@

import os
import re
import subprocess
from sys import platform

import requests

Expand All @@ -40,23 +38,19 @@

url = re_url.findall(response.text)[0]

print(f"Platform is {platform}.")
if "win" in platform: # if we are on Windows, use curl.exe (supported in Windows 10 and up)
tmpfile = "C:/temp/tmp.zip"
wget = subprocess.Popen(["curl.exe", "--output", tmpfile, "--url", url])
else:
tmpfile = "/tmp/tmp.zip"
wget = subprocess.Popen(["wget", "-O", tmpfile, url])
tmpfile = "/tmp/tmp.zip"
print(f"Downloading Drugbank dump from {url} to {tmpfile}...")
response = requests.get(url)
response.raise_for_status() # Raise an exception for bad status codes

os.waitpid(wget.pid, 0)
with open(tmpfile, 'wb') as f:
f.write(response.content)

print(f"Downloaded Drugbank dump from {url} to {tmpfile}.")

if "win" in platform: # if we are on Windows, use curl.exe (supported in Windows 10 and up)
unzip = subprocess.Popen(["unzip", -"o", tmpfile, "-d", "."])
else:
unzip = subprocess.Popen(["unzip", "-o", tmpfile, "-d", "."])

os.waitpid(unzip.pid, 0)
import zipfile
print(f"Unzipping Drugbank dump from {tmpfile} to current directory...")
with zipfile.ZipFile(tmpfile, 'r') as zip_ref:
zip_ref.extractall(".")

print(f"Unzipped Drugbank dump from {tmpfile} to current directory.")
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,8 @@
import csv
import datetime
import os
import subprocess
import requests
import xml.sax
from sys import platform

# Example URL of MeSH dump: https://nlmpubs.nlm.nih.gov/projects/mesh/MESH_FILES/xmlmesh/desc2023.xml

Expand All @@ -47,13 +46,12 @@
print(
f"Downloading MeSH XML dump from {url}. If this URL doesn't work, please navigate to https://www.nlm.nih.gov/ and search the site for a MeSH data dump in XML format.")

print(f"Platform is {platform}.")
if "win" in platform: # if we are on Windows, use curl.exe (supported in Windows 10 and up)
wget = subprocess.Popen(["curl.exe", "--output", mesh_xml_file_name, "--url", url])
else:
wget = subprocess.Popen(["wget", url])
print("Downloading MeSH XML dump...")
response = requests.get(url)
response.raise_for_status() # Raise an exception for bad status codes

os.waitpid(wget.pid, 0)
with open(mesh_xml_file_name, 'wb') as f:
f.write(response.content)

print(f"Downloaded MeSH XML dump from {url}.")

Expand Down
61 changes: 31 additions & 30 deletions harvesting_data_from_source/05_download_smiles_from_pubchem.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,61 +31,62 @@
# Note that we have to get two files: one is Pubchem ID to SMILES, and one is Pubchem ID to MeSH name (not MeSH ID)
# We can later join these to get MeSH name to SMILES

import gzip
import os
import subprocess
from sys import platform
import requests
import shutil

url_pubchem_mesh = "https://ftp.ncbi.nlm.nih.gov/pubchem/Compound/Extras/CID-MeSH"
output_file_pubchem_mesh = "CID-MeSH"

print(f"Platform is {platform}.")
if "win" in platform: # if we are on Windows, use curl.exe (supported in Windows 10 and up)
wget = subprocess.Popen(["curl.exe", "--output", output_file_pubchem_mesh, "--url", url_pubchem_mesh])
else:
wget = subprocess.Popen(["wget", "-O", output_file_pubchem_mesh, url_pubchem_mesh])
# Download MeSH file
print(f"Downloading Pubchem MeSH dump for SMILES from {url_pubchem_mesh} to {output_file_pubchem_mesh}...")
response = requests.get(url_pubchem_mesh)
response.raise_for_status()

os.waitpid(wget.pid, 0)
with open(output_file_pubchem_mesh, 'wb') as f:
f.write(response.content)

print(f"Downloaded Pubchem MeSH dump for SMILES from {url_pubchem_mesh} to {output_file_pubchem_mesh}.")

# Download SMILES file
url_pubchem_smiles = "https://ftp.ncbi.nlm.nih.gov/pubchem/Compound/Extras/CID-SMILES.gz"
output_file_pubchem_smiles = "CID-SMILES.gz"

print(f"Platform is {platform}.")
if "win" in platform: # if we are on Windows, use curl.exe (supported in Windows 10 and up)
wget = subprocess.Popen(["curl.exe", "--output", output_file_pubchem_smiles, "--url", url_pubchem_smiles])
else:
wget = subprocess.Popen(["wget", "-O", output_file_pubchem_smiles, url_pubchem_smiles])
print(f"Downloading Pubchem SMILES data from {url_pubchem_smiles} to {output_file_pubchem_smiles}...")
response = requests.get(url_pubchem_smiles)
response.raise_for_status()

os.waitpid(wget.pid, 0)
with open(output_file_pubchem_smiles, 'wb') as f:
f.write(response.content)

print(f"Downloaded Pubchem SMILES data from {url_pubchem_smiles} to {output_file_pubchem_smiles}.")

# Download mass file
url_pubchem_mass = "https://ftp.ncbi.nlm.nih.gov/pubchem/Compound/Extras/CID-Mass.gz"
output_file_pubchem_mass = "CID-Mass.gz"

print(f"Platform is {platform}.")
if "win" in platform: # if we are on Windows, use curl.exe (supported in Windows 10 and up)
wget = subprocess.Popen(["curl.exe", "--output", output_file_pubchem_mass, "--url", url_pubchem_mass])
else:
wget = subprocess.Popen(["wget", "-O", output_file_pubchem_mass, url_pubchem_mass])
print(f"Downloading Pubchem mass data from {url_pubchem_mass} to {output_file_pubchem_mass}...")
response = requests.get(url_pubchem_mass)
response.raise_for_status()

os.waitpid(wget.pid, 0)
with open(output_file_pubchem_mass, 'wb') as f:
f.write(response.content)

print(f"Downloaded Pubchem mass data from {url_pubchem_mass} to {output_file_pubchem_mass}.")

print(f"Unzipping {output_file_pubchem_smiles}.")

unzip = subprocess.Popen(["gunzip", "-f", output_file_pubchem_smiles])

os.waitpid(unzip.pid, 0)
# Unzip SMILES file
print(f"Unzipping {output_file_pubchem_smiles}...")
with gzip.open(output_file_pubchem_smiles, 'rb') as f_in:
with open(output_file_pubchem_smiles[:-3], 'wb') as f_out: # Remove .gz extension
shutil.copyfileobj(f_in, f_out)

print(f"Unzipped {output_file_pubchem_smiles}.")

print(f"Unzipping {output_file_pubchem_mass}.")

unzip = subprocess.Popen(["gunzip", "-f", output_file_pubchem_mass])

os.waitpid(unzip.pid, 0)
# Unzip mass file
print(f"Unzipping {output_file_pubchem_mass}...")
with gzip.open(output_file_pubchem_mass, 'rb') as f_in:
with open(output_file_pubchem_mass[:-3], 'wb') as f_out: # Remove .gz extension
shutil.copyfileobj(f_in, f_out)

print(f"Unzipped {output_file_pubchem_mass}.")
Loading
Loading