Pubmed Figure Scraping

Unfortunately, I cannot release the pubmed image dataset directly, as I am unsure of copyright. However below is the script for downloading the images directly from pubmed, sorting, and quality controlling the data.

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals
In [2]:
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
In [3]:
import tensorflow as tf
In [4]:
tf.__version__
Out[4]:
'2.5.0'
In [5]:
!pip install imageio
Requirement already satisfied: imageio in /usr/local/lib/python3.7/dist-packages (2.4.1)
Requirement already satisfied: pillow in /usr/local/lib/python3.7/dist-packages (from imageio) (7.1.2)
Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from imageio) (1.19.5)
In [ ]:
!pip install wget 
In [7]:
#Imports for image manipulation
import glob
import imageio
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import urllib.request
import os
import PIL
from tensorflow.keras import layers
import time
import shutil
import wget
from IPython import display
In [ ]:
from google.colab import drive
drive.mount('/content/drive') #save to drive

Load and prepare the dataset

Below code downloads and sorts the images

In [27]:
dataDF = pd.read_csv('ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_file_list.csv')
In [23]:
os.chdir('/content/')
os.listdir()
Out[23]:
['.config', 'drive', 'PMC1173111.tar.gz', 'sample_data']
In [10]:
dataDF = dataDF[dataDF['Article Citation'].str.contains("Neur")==True] #selects neuroscience articles, any tag can be chosen 
In [11]:
print(dataDF)
                                     File  ...     License
176      oa_package/fa/a7/PMC29099.tar.gz  ...  NO-CC CODE
177      oa_package/d0/51/PMC29100.tar.gz  ...  NO-CC CODE
178      oa_package/76/21/PMC29101.tar.gz  ...  NO-CC CODE
179      oa_package/f1/ec/PMC29102.tar.gz  ...  NO-CC CODE
180      oa_package/a8/cf/PMC29103.tar.gz  ...  NO-CC CODE
...                                   ...  ...         ...
19422  oa_package/43/7e/PMC1236932.tar.gz  ...       CC BY
19423  oa_package/75/ff/PMC1236933.tar.gz  ...       CC BY
19445  oa_package/84/dd/PMC1236955.tar.gz  ...       CC BY
19500  oa_package/34/81/PMC1239918.tar.gz  ...       CC BY
19501  oa_package/b9/86/PMC1239919.tar.gz  ...       CC BY

[343 rows x 6 columns]
In [24]:
url = 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/'
randsamp = dataDF.sample(1000, axis=0) #sample N number data points
print(randsamp)
##Creat a data directory first or else you will get an error
file_list = np.asarray(os.listdir('/content/drive/My Drive/data/'))
for i, x in enumerate(randsamp["File"]):
    id = x.split('/')[-1].split('.')[0] #Grab the file name
    
    if id in file_list:
      print('null')
    else:
      name, _ = urllib.request.urlretrieve(url + x, "/content/" + id + ".tar.gz") #grab the file from ftp
      year = randsamp['Article Citation'].iloc[i].split('.')[-1].split(';')[0] #Grab the year for labeling purposes
      shutil.unpack_archive(x.split('/')[-1], '/tmp/') #unpack the tarball
      if i % 100 == 0:
        print('Number ' + str(i))
      os.remove(x.split('/')[-1])
      for r, n , files in os.walk('/tmp/'): #Walk thru the folder
        for f in files:
          file_path = os.path.join(r, f)
          if '.jpg' in f: #if its a jpg
            shutil.copy(file_path,'/content/drive/My Drive/data/' + str(id) + '_'  + year + '_' + f) #copy to the folder
            os.remove(file_path)
          else:
            os.remove(file_path)
      os.removedirs('/tmp/'+ id)
                                     File  ...     License
19223  oa_package/7d/65/PMC1215486.tar.gz  ...       CC BY
17947  oa_package/e0/ba/PMC1180450.tar.gz  ...       CC BY
3844    oa_package/6f/6a/PMC169170.tar.gz  ...  NO-CC CODE
7072    oa_package/89/71/PMC483061.tar.gz  ...  NO-CC CODE
8174    oa_package/7f/93/PMC526203.tar.gz  ...       CC BY
7142    oa_package/ba/ba/PMC500895.tar.gz  ...  NO-CC CODE
1549    oa_package/6c/70/PMC122075.tar.gz  ...  NO-CC CODE
6738    oa_package/96/d2/PMC449712.tar.gz  ...  NO-CC CODE
6482    oa_package/60/5c/PMC428574.tar.gz  ...  NO-CC CODE
18277  oa_package/52/95/PMC1183241.tar.gz  ...       CC BY

[10 rows x 6 columns]
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:9: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if __name__ == '__main__':
Number 0
In [26]:
imagefold = '/content/drive/My Drive/data/' #Set to where you saved the images
for r, s,f in os.walk(imagefold): #Walk thru the image folder
  print(len(f))
  for file in f:
    if '.jpg' in file:
      img_p = os.path.join(r,file)
      fn = img_p
      
      image_contents = tf.io.read_file(fn)
      try:
        image = tf.image.decode_jpeg(image_contents, channels=1) #look for errors in dataset loading
      except:
        print('removing' + img_p) #If it fails to open for whatever reason, it is deleted. Saves a headache later.
        os.remove(img_p)
58
In [ ]:
shutil.make_archive('data', 'zip', '/data/')
Out[ ]:
'/content/data.zip'

Loading the dataset

In [ ]:
imagefold = '/content/drive/My Drive/data/'
list_ds = tf.data.Dataset.list_files(str(imagefold+'*.jpg'))
from PIL import Image
def image_pro(path):

      tmp_img = tf.io.read_file(path)
      try:
        tmp_img = tf.io.decode_jpeg(tmp_img, 3)
      except:
        tmp_img = tf.io.read_file(str(''))
        tmp_img = tf.io.decode_jpeg(tmp_img, 3)
      label = tf.strings.split(path, ' ')[2]
      tmp_img = tf.image.resize(tmp_img, (128, 128)) #Resize to desired format
      tmp_img = tf.image.convert_image_dtype(tmp_img, tf.float32) 
      return tmp_img, int(label)
labeled_ds = list_ds.map(image_pro)