Skip to content
Snippets Groups Projects
Commit 7f6d2c65 authored by kjk's avatar kjk
Browse files

added parallel download

parent d568d9f0
No related branches found
No related tags found
No related merge requests found
USDB.xlsx
### MicrosoftOffice template
*.tmp
# Word temporary
~$*.doc*
# Word Auto Backup File
Backup of *.doc*
# Excel temporary
~$*.xls*
# Excel Backup File
*.xlk
# PowerPoint temporary
~$*.ppt*
# Visio autosave temporary files
*.~vsd*
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
USDB.xlsx 0 → 100644
File added
import requests
from pathlib import Path
class CoverDownloader:
    """Download cover images over HTTP and store them as ``.jpg`` files.

    Files are written to ``outdir`` and named
    ``"{artist} - {song} - {spotify_uri}.jpg"``.
    """

    # Seconds to wait for the server before giving up; without a timeout a
    # stalled connection would block the calling thread indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, outdir: str):
        """Remember the directory that downloaded covers are written to."""
        self.outdir = outdir

    def download(self, url: str, artist: str, song: str, spotify_uri: str):
        """Fetch ``url`` and save the response body as the song's cover image.

        Args:
            url: Address of the image to download.
            artist: Artist name, used as the first part of the file name.
            song: Song name, used as the second part of the file name.
            spotify_uri: Track identifier, used as the last part of the file name.

        Raises:
            requests.RequestException: If the request fails, times out, or the
                server answers with an HTTP error status.
        """
        # Fetch before touching the file system so that a failed request does
        # not leave an empty or truncated .jpg behind.
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        # Refuse to store an HTML error page under an image file name.
        response.raise_for_status()

        target_dir = Path(self.outdir)
        target_dir.mkdir(parents=True, exist_ok=True)
        with open(target_dir / f"{artist} - {song} - {spotify_uri}.jpg", "wb") as f:
            f.write(response.content)
\ No newline at end of file
db.py 0 → 100644
import pandas as pd
import pandas.core.series
def load(file: str):
    """Open ``file`` in binary mode and parse it with ``pd.read_excel``.

    Returns whatever ``pd.read_excel`` produces for the opened workbook.
    """
    with open(file, 'rb') as workbook:
        table = pd.read_excel(workbook)
    return table
def get_usdb_url(song: pandas.Series):
    """Return the value stored under the ``"TXT Link"`` key of ``song``."""
    usdb_link = song["TXT Link"]
    return usdb_link
def get_cover_image_url(song: pandas.Series):
    """Return the value stored under the ``"Cover Link"`` key of ``song``."""
    cover_link = song["Cover Link"]
    return cover_link
def get_yt_video_url(song: pandas.Series):
    """Return the value stored under the ``"Video Link"`` key of ``song``."""
    video_link = song["Video Link"]
    return video_link
def get_artist_name(song: pandas.Series):
    """Return the value stored under the ``"Artist Name"`` key of ``song``."""
    artist = song["Artist Name"]
    return artist
def get_song_name(song: pandas.Series):
    """Return the value stored under the ``"Track Name"`` key of ``song``."""
    title = song["Track Name"]
    return title
def get_track_id(song: pandas.Series):
    """Return the value stored under the ``"Spotify URI"`` key of ``song``."""
    track_id = song["Spotify URI"]
    return track_id
class USDB:
    """Container for the table read from a spreadsheet file.

    The parsed table is exposed as the ``data`` attribute.
    """

    def __init__(self, file: str):
        """Read ``file`` through the module-level ``load`` helper."""
        table = load(file)
        self.data = table
import requests import concurrent
from bs4 import BeautifulSoup
import pandas
class USDBDownloader: from txtdownloader import TXTDownloader
def __init__(self, sessid): from ytdownloader import YTDownloader
self.sessid = sessid from coverdownloader import CoverDownloader
from db import *
def download(self, url: str): db = USDB("USDB.xlsx")
url = url.replace("detail", "gettxt") txt = TXTDownloader("nv0rb8ma82p37qrvduvch6j3f6", "./out/txt/")
print(url) yt = YTDownloader("./out/audio/", "./out/video")
cv = CoverDownloader("./out/covers/")
with requests.Session() as s: def download_song(song: pandas.Series):
page = s.post(url, data={'wd': 1}, cookies={'PHPSESSID': self.sessid}) usdb_url = get_usdb_url(song)
cover_url = get_cover_image_url(song)
yt_url = get_yt_video_url(song)
song_name = get_song_name(song).replace("/", "")
artist = get_artist_name(song).replace("/", "")
spotify_uri = get_track_id(song).replace("/", "")
soup = BeautifulSoup(page.content, 'html.parser') if type(usdb_url) == str and usdb_url != " ":
txt = soup.find(name="textarea").text txt.download(usdb_url, artist, song_name, spotify_uri)
return txt
if type(cover_url) == str and cover_url != " ":
cv.download(cover_url, artist, song_name, spotify_uri)
if type(yt_url) == str and yt_url != " ":
yt.download(yt_url, artist, song_name, spotify_uri)
executor = concurrent.futures.ThreadPoolExecutor(10)
futures = [executor.submit(download_song, song) for _, song in db.data.sample(10).iterrows()]
concurrent.futures.wait(futures)
if __name__ == '__main__':
usdbdl = USDBDownloader("")
usdbdl.download("http://usdb.animux.de/?link=detail&id=16196")
...@@ -5,3 +5,4 @@ idna==3.3 ...@@ -5,3 +5,4 @@ idna==3.3
requests==2.28.1 requests==2.28.1
soupsieve==2.3.2.post1 soupsieve==2.3.2.post1
urllib3==1.26.10 urllib3==1.26.10
pandas~=1.4.3
\ No newline at end of file
import urllib.parse
from pathlib import Path
import requests
from bs4 import BeautifulSoup
class TXTDownloader:
    """Download a song's TXT from its detail page and save it to disk.

    The TXT is requested with a POST that carries the ``PHPSESSID`` cookie,
    extracted from the first ``<textarea>`` of the response, and written to
    ``outdir`` as ``"{artist} - {song} - {spotify_uri}.txt"``.
    """

    # Seconds to wait for the server before giving up; without a timeout a
    # stalled connection would block the calling thread indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, sessid, outdir):
        """Store the session id sent as ``PHPSESSID`` and the output directory."""
        self.sessid = sessid
        self.outdir = outdir

    def download(self, url: str, artist: str, song: str, spotify_uri: str):
        """Fetch the TXT behind ``url`` and write it to the output directory.

        Args:
            url: Detail-page URL; every ``"detail"`` in it is replaced by
                ``"gettxt"`` to form the request URL.
            artist: Artist name, used as the first part of the file name.
            song: Song name, used as the second part of the file name.
            spotify_uri: Track identifier, used as the last part of the file name.

        Raises:
            requests.RequestException: If the request fails, times out, or the
                server answers with an HTTP error status.
            ValueError: If the response contains no ``<textarea>`` element.
        """
        url = url.replace("detail", "gettxt")

        with requests.Session() as s:
            page = s.post(
                url,
                data={'wd': 1},
                cookies={'PHPSESSID': self.sessid},
                timeout=self.REQUEST_TIMEOUT,
            )
        page.raise_for_status()

        soup = BeautifulSoup(page.content, 'html.parser')
        textarea = soup.find(name="textarea")
        if textarea is None:
            # find() returns None when nothing matches; fail with a message
            # that names the URL instead of an opaque AttributeError.
            raise ValueError(f"no <textarea> found in response from {url}")
        txt = textarea.text

        target_dir = Path(self.outdir)
        target_dir.mkdir(parents=True, exist_ok=True)
        with open(target_dir / f"{artist} - {song} - {spotify_uri}.txt", "w", encoding="utf-8") as f:
            f.write(txt)
import shutil
from pathlib import Path
import yt_dlp.postprocessor
from yt_dlp import YoutubeDL
class YTDownloader:
    """Download a video with yt-dlp, extract an MP3, and file the MP4 separately.

    yt-dlp writes into ``audiodir``; afterwards the ``.mp4`` is moved to
    ``videodir``. Files are named ``"{artist} - {song} - {spotify_uri}"``
    plus the extension.
    """

    def __init__(self, audiodir, videodir):
        """Store the directories used for the download and for the moved video."""
        self.audiodir = audiodir
        self.videodir = videodir

    def download(self, url: str, artist: str, song: str, spotify_uri: str):
        """Download ``url``, extract its audio as MP3, and move the MP4 to ``videodir``.

        Args:
            url: Address of the video to download.
            artist: Artist name, used as the first part of the file name.
            song: Song name, used as the second part of the file name.
            spotify_uri: Track identifier, used as the last part of the file name.

        Raises:
            FileNotFoundError: If no ``.mp4`` with the expected name exists in
                ``audiodir`` after the download (raised by ``shutil.move``).
        """
        basename = f"{artist} - {song} - {spotify_uri}"
        target = str((Path(self.audiodir) / basename).resolve())

        ydl_opts = {
            'format': 'mp4/best',
            # The output template is %-formatted by yt-dlp, so a literal "%"
            # in the path or song metadata must be doubled to survive intact.
            'outtmpl': target.replace("%", "%%") + ".%(ext)s",
            # ℹ️ See help(yt_dlp.postprocessor) for a list of available Postprocessors and their arguments
            'keepvideo': True,
            'postprocessors': [{  # Extract audio using ffmpeg
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
            }]
        }

        with YoutubeDL(ydl_opts) as ydl:
            # download() takes a list of URLs.
            ydl.download([url])

        video_name = f"{basename}.mp4"
        Path(self.videodir).mkdir(parents=True, exist_ok=True)
        shutil.move(
            str(Path(self.audiodir) / video_name),
            str(Path(self.videodir) / video_name),
        )
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment