gregory icon indicating copy to clipboard operation
gregory copied to clipboard

pubmed articles return a truncated abstract

Open brunoamaral opened this issue 8 months ago • 0 comments

RSS feed where the problem was found: https://pubmed.ncbi.nlm.nih.gov/rss/search/10guX6I3SqrbUeeLKSTD6FCRM44ewnrN2MKKTQLLPMHB4xNsZU/?limit=15&utm_campaign=pubmed-2&fc=20210216052009

The code that processes the feed data is in the feedreader.py file:

		def update_articles_from_feeds(self):
			"""Create or update Articles from every RSS source marked as 'science paper'.

			For each feed entry the method:

			1. Picks the richest abstract available in the entry: ``summary``,
			   then ``summary_detail``, then (PubMed feeds only) the first
			   ``content`` element.
			2. Extracts a DOI for PubMed (``dc_identifier``) and FASEB
			   (``prism_doi``) feeds.
			3. With a DOI: refreshes a ``SciencePaper`` for it, creates or updates
			   the matching article and links the authors it reports.
			   Without a DOI: creates or updates the article matched by title only.
			"""
			sources = Sources.objects.filter(method='rss', source_for='science paper')
			for source in sources:
				feed = self.fetch_feed(source.link, source.ignore_ssl)
				for entry in feed['entries']:
					title = entry['title']
					self.stdout.write(f"Processing {title}")

					# Prefer the most complete abstract the entry offers.
					summary = entry.get('summary', '')
					if hasattr(entry, 'summary_detail'):
						summary = entry['summary_detail']['value']
					if 'pubmed' in source.link and hasattr(entry, 'content'):
						summary = entry['content'][0]['value']

					# NOTE(review): if the entry has neither 'published' nor
					# 'prism_coverdate', parse() is called with None - confirm that
					# every supported feed provides one of them.
					published_date = parse(entry.get('published') or entry.get('prism_coverdate'), tzinfos=self.tzinfos).astimezone(pytz.utc)
					link = greg.remove_utm(entry['link'])

					doi = None
					if 'pubmed' in source.link and entry.get('dc_identifier', '').startswith('doi:'):
						doi = entry['dc_identifier'].replace('doi:', '')
					elif 'faseb' in source.link:
						doi = entry.get('prism_doi', '')

					if doi:
						crossref_paper = SciencePaper(doi=doi)
						crossref_paper.refresh()
						title = crossref_paper.title if crossref_paper.title else entry['title']
						# Keep the abstract already extracted from the feed when the
						# refreshed paper has none. Falling back to entry['summary']
						# here threw away the fuller 'content' text and produced
						# truncated abstracts.
						if crossref_paper.abstract:
							summary = crossref_paper.abstract

						# Check if an article with the same DOI or title exists
						existing_article = Articles.objects.filter(Q(doi=doi) | Q(title=title)).first()
						if existing_article:
							science_paper = existing_article
							cleaned_summary = SciencePaper.clean_abstract(abstract=summary)
							if any([science_paper.title != title, science_paper.summary != cleaned_summary,
									science_paper.link != link, science_paper.published_date != published_date]):
								science_paper.title = title
								science_paper.summary = cleaned_summary
								science_paper.link = link
								science_paper.published_date = published_date
								science_paper.sources.add(source)
								science_paper.teams.add(source.team)
								science_paper.subjects.add(source.subject)
								science_paper.save()
						else:
							science_paper = Articles.objects.create(
								doi=doi,
								title=title,
								summary=summary,
								link=link,
								published_date=published_date,
								container_title=crossref_paper.journal,
								publisher=crossref_paper.publisher,
								access=crossref_paper.access,
								crossref_check=timezone.now()
							)
							science_paper.teams.add(source.team)
							science_paper.subjects.add(source.subject)
							science_paper.sources.add(source)
							science_paper.save()

						# Process author information
						if crossref_paper.authors is not None:
							for author_info in crossref_paper.authors:
								given_name = author_info.get('given')
								family_name = author_info.get('family')
								orcid = author_info.get('ORCID', None)
								try:
									if orcid:  # If ORCID is present, use it as the primary key for author lookup/creation
										author_obj, author_created = Authors.objects.get_or_create(
											ORCID=orcid,
											defaults={
												'given_name': given_name,
												'family_name': family_name
											}
										)
									else:  # If no ORCID is provided, fallback to using given_name and family_name for lookup/creation
										if not given_name or not family_name:
											self.stdout.write(f"Missing given name or family name, skipping this author. {crossref_paper.doi}")
											continue
										author_obj, author_created = Authors.objects.get_or_create(
											given_name=given_name,
											family_name=family_name,
											defaults={'ORCID': orcid}  # orcid is falsy here (None or empty)
										)
								except MultipleObjectsReturned:
									# Handle the case where multiple authors are returned
									authors = Authors.objects.filter(given_name=given_name, family_name=family_name)
									print(f"Multiple authors found for {given_name} {family_name}:")
									for author in authors:
										print(f"Author ID: {author.author_id}, ORCID: {author.ORCID}")
									# Use the first author with an ORCID, if available
									author_obj = next((author for author in authors if author.ORCID), authors.first())
									if author_obj is None:
										# The name filter matched nobody (e.g. the duplicates
										# were found by ORCID), so there is no author to link.
										continue

								# Link the author to the article if not already linked
								if not science_paper.authors.filter(pk=author_obj.pk).exists():
									science_paper.authors.add(author_obj)
					else:
						print('no DOI, trying to create article')
						existing_article = Articles.objects.filter(title=title).first()
						if existing_article:
							science_paper = existing_article
							cleaned_summary = SciencePaper.clean_abstract(abstract=summary)
							if any([science_paper.title != title, science_paper.summary != cleaned_summary,
									science_paper.link != link, science_paper.published_date != published_date]):
								science_paper.title = title
								science_paper.summary = cleaned_summary
								science_paper.link = link
								science_paper.published_date = published_date
								science_paper.teams.add(source.team)
								science_paper.subjects.add(source.subject)
								science_paper.sources.add(source)
								science_paper.save()
						else:
							# NOTE(review): unlike the DOI branch, a new article here is
							# created with source=source and gets no team/subject/sources
							# links - confirm this is intentional.
							science_paper = Articles.objects.create(
								title=title,
								summary=summary,
								link=link,
								published_date=published_date,
								source=source,
								crossref_check=None
							)

brunoamaral avatar Jun 23 '24 19:06 brunoamaral