Unfortunately, I cannot release the PubMed image dataset directly, as I am unsure of its copyright status. However, below is the script for downloading the images directly from PubMed Central, sorting them, and quality-controlling the data.
from __future__ import absolute_import, division, print_function, unicode_literals
try:
    # %tensorflow_version only exists in Colab.
    %tensorflow_version 2.x
except Exception:
    pass
import tensorflow as tf
tf.__version__
!pip install imageio
!pip install wget
#Imports for image manipulation
import glob
import imageio
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import urllib.request
import os
import PIL
from tensorflow.keras import layers
import time
import shutil
import wget
from IPython import display
from google.colab import drive
drive.mount('/content/drive') #save to drive
The code below downloads and sorts the images.
dataDF = pd.read_csv('ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_file_list.csv')
os.chdir('/content/')
os.listdir()
dataDF = dataDF[dataDF['Article Citation'].str.contains("Neur")==True] #selects neuroscience articles, any tag can be chosen
print(dataDF)
url = 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/'
randsamp = dataDF.sample(1000, axis=0) #Randomly sample N articles (here N = 1000)
print(randsamp)
##Create the data directory on Drive first or the listing below will fail
os.makedirs('/content/drive/My Drive/data/', exist_ok=True)
file_list = np.asarray(os.listdir('/content/drive/My Drive/data/'))
for i, x in enumerate(randsamp["File"]):
    id = x.split('/')[-1].split('.')[0] #Grab the article ID from the file name
    if any(f.startswith(id) for f in file_list): #Skip articles whose images are already saved
        print('Already downloaded ' + id)
    else:
        name, _ = urllib.request.urlretrieve(url + x, "/content/" + id + ".tar.gz") #Grab the file from the FTP server
        year = randsamp['Article Citation'].iloc[i].split('.')[-1].split(';')[0] #Grab the year for labeling purposes
        shutil.unpack_archive(x.split('/')[-1], '/tmp/') #Unpack the tarball
        if i % 100 == 0:
            print('Number ' + str(i))
        os.remove(x.split('/')[-1])
        for r, n, files in os.walk('/tmp/' + id): #Walk through the unpacked article folder
            for f in files:
                file_path = os.path.join(r, f)
                if '.jpg' in f: #If it's a jpg, copy it to Drive with the ID and year in the name
                    shutil.copy(file_path, '/content/drive/My Drive/data/' + str(id) + '_' + year + '_' + f)
                os.remove(file_path)
        shutil.rmtree('/tmp/' + id, ignore_errors=True) #Clean up the unpacked folder
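Once the loop finishes, it is worth a quick sanity check on how many images actually landed in the Drive folder before moving on (a minimal sketch, using the same path as above):

print(len([f for f in os.listdir('/content/drive/My Drive/data/') if f.endswith('.jpg')]), 'images saved')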
imagefold = '/content/drive/My Drive/data/' #Set to where you saved the images
for r, s, f in os.walk(imagefold): #Walk through the image folder
    print(len(f))
    for file in f:
        if '.jpg' in file:
            img_p = os.path.join(r, file)
            image_contents = tf.io.read_file(img_p)
            try:
                image = tf.image.decode_jpeg(image_contents, channels=1) #Look for errors when decoding
            except Exception:
                print('removing ' + img_p) #If it fails to open for whatever reason, delete it. Saves a headache later.
                os.remove(img_p)
shutil.make_archive('data', 'zip', imagefold) #Zip the cleaned image folder
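Because the year is baked into each filename (<id>_<year>_<originalname>.jpg), it is also easy to check how the sample is spread across publication years. A minimal sketch, assuming the naming scheme used above:

from collections import Counter
year_counts = Counter(fname.split('_')[1] for fname in os.listdir(imagefold) if fname.endswith('.jpg'))
print(year_counts)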
imagefold = '/content/drive/My Drive/data/'
list_ds = tf.data.Dataset.list_files(str(imagefold+'*.jpg'))
def image_pro(path):
    tmp_img = tf.io.read_file(path)
    tmp_img = tf.io.decode_jpeg(tmp_img, channels=3) #Corrupt files were already removed in the QC step above
    label = tf.strings.split(path, '_')[1] #Filenames are <id>_<year>_<name>.jpg, so element 1 is the year (assumes no other underscores in the path)
    tmp_img = tf.image.resize(tmp_img, (128, 128)) #Resize to desired format
    tmp_img = tf.image.convert_image_dtype(tmp_img, tf.float32)
    return tmp_img, tf.strings.to_number(label, out_type=tf.int32)
labeled_ds = list_ds.map(image_pro)
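To feed labeled_ds into a model, the usual tf.data steps (shuffle, batch, prefetch) can be chained on, and one batch inspected to confirm shapes. This is a minimal sketch; the batch size and shuffle buffer are assumed values, not anything fixed by the pipeline above.

BATCH_SIZE = 32 #Assumed value; adjust to fit memory
train_ds = labeled_ds.shuffle(1000).batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)
for images, labels in train_ds.take(1): #Peek at one batch to confirm shapes and labels
    print(images.shape, labels[:5])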