From cd9ee9bbb17b9b755f04ea7cf15e3aa08b3cb193 Mon Sep 17 00:00:00 2001 From: Sander Vocke Date: Mon, 23 Nov 2020 15:34:28 +0100 Subject: [PATCH] Add a YouTube Music web scraper. --- client/src/api.ts | 9 +- client/src/assets/youtubemusic_icon.svg | 16 ++ .../src/components/common/StoreLinkIcon.tsx | 10 +- .../windows/settings/IntegrationSettings.tsx | 34 ++- .../windows/song/EditSongDialog.tsx | 2 +- .../src/lib/integration/useIntegrations.tsx | 15 +- .../youtubemusic/YoutubeMusicWebScraper.tsx | 203 ++++++++++++++++++ server/integrations/integrations.ts | 15 ++ 8 files changed, 292 insertions(+), 12 deletions(-) create mode 100644 client/src/assets/youtubemusic_icon.svg create mode 100644 client/src/lib/integration/youtubemusic/YoutubeMusicWebScraper.tsx diff --git a/client/src/api.ts b/client/src/api.ts index 92333f1..c872f29 100644 --- a/client/src/api.ts +++ b/client/src/api.ts @@ -360,18 +360,21 @@ export const LogoutEndpoint = "/logout"; export enum IntegrationType { SpotifyClientCredentials = "SpotifyClientCredentials", + YoutubeWebScraper = "YoutubeWebScraper", } export interface SpotifyClientCredentialsDetails { clientId: string, } - export interface SpotifyClientCredentialsSecretDetails { clientSecret: string, } -export type IntegrationDetails = SpotifyClientCredentialsDetails; -export type IntegrationSecretDetails = SpotifyClientCredentialsSecretDetails; +export interface YoutubeMusicWebScraperDetails {} +export interface YoutubeMusicWebScraperSecretDetails {} + +export type IntegrationDetails = SpotifyClientCredentialsDetails | YoutubeMusicWebScraperDetails; +export type IntegrationSecretDetails = SpotifyClientCredentialsSecretDetails | YoutubeMusicWebScraperSecretDetails; // Create a new integration (POST). 
export const CreateIntegrationEndpoint = '/integration'; diff --git a/client/src/assets/youtubemusic_icon.svg b/client/src/assets/youtubemusic_icon.svg new file mode 100644 index 0000000..1b3c565 --- /dev/null +++ b/client/src/assets/youtubemusic_icon.svg @@ -0,0 +1,16 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/client/src/components/common/StoreLinkIcon.tsx b/client/src/components/common/StoreLinkIcon.tsx index c35fd75..72f600f 100644 --- a/client/src/components/common/StoreLinkIcon.tsx +++ b/client/src/components/common/StoreLinkIcon.tsx @@ -1,10 +1,12 @@ import React from 'react'; import { ReactComponent as GPMIcon } from '../../assets/googleplaymusic_icon.svg'; import { ReactComponent as SpotifyIcon } from '../../assets/spotify_icon.svg'; +import { ReactComponent as YoutubeMusicIcon } from '../../assets/youtubemusic_icon.svg'; export enum ExternalStore { GooglePlayMusic = "Google Play Music", Spotify = "Spotify", + YoutubeMusic = "Youtube Music", } export interface IProps { @@ -16,6 +18,8 @@ export function whichStore(url: string) { return ExternalStore.GooglePlayMusic; } else if (url.includes('spotify.com')) { return ExternalStore.Spotify; + } else if (url.includes('music.youtube.com')) { + return ExternalStore.YoutubeMusic; } return undefined; } @@ -28,9 +32,11 @@ export default function StoreLinkIcon(props: any) { switch (whichStore) { case ExternalStore.GooglePlayMusic: - return ; + return ; case ExternalStore.Spotify: - return ; + return ; + case ExternalStore.YoutubeMusic: + return ; default: throw new Error("Unknown external store: " + whichStore) } diff --git a/client/src/components/windows/settings/IntegrationSettings.tsx b/client/src/components/windows/settings/IntegrationSettings.tsx index 928fd15..d8210f3 100644 --- a/client/src/components/windows/settings/IntegrationSettings.tsx +++ b/client/src/components/windows/settings/IntegrationSettings.tsx @@ -84,7 +84,16 @@ function EditIntegration(props: { })} Spotify (using 
Client Credentials) - + , + [serverApi.IntegrationType.YoutubeWebScraper]: + + + {new IntegrationClasses[serverApi.IntegrationType.YoutubeWebScraper](-1).getIcon({ + style: { height: '40px', width: '40px' } + })} + + Youtube Music (using experimental web scraper) + , } let IntegrationDescription: Record = { [serverApi.IntegrationType.SpotifyClientCredentials]: @@ -95,7 +104,15 @@ function EditIntegration(props: { Please see the Spotify API documentation on how to generate a client ID and client secret. Once set, you will only be able to overwrite the secret here, not read it. - + , + [serverApi.IntegrationType.YoutubeWebScraper]: + + This integration allows using the public Youtube Music search page to scrape + for music metadata.
+ Because it relies on reverse-engineering of a web page that may change in the + future, this is considered to be experimental and unstable. However, the music links acquired + using this method are expected to remain reasonably stable. +
, } return @@ -122,8 +139,9 @@ function EditIntegration(props: { {props.integration.type === serverApi.IntegrationType.SpotifyClientCredentials && Spotify + >Spotify via Client Credentials + { + props.onAdd && props.onAdd(serverApi.IntegrationType.YoutubeWebScraper); + props.onClose && props.onClose(); + }} + >Youtube Music Web Scraper } diff --git a/client/src/components/windows/song/EditSongDialog.tsx b/client/src/components/windows/song/EditSongDialog.tsx index 30b0228..3b8b962 100644 --- a/client/src/components/windows/song/EditSongDialog.tsx +++ b/client/src/components/windows/song/EditSongDialog.tsx @@ -23,7 +23,7 @@ export function ProvideLinksWidget(props: { props.providers.length > 0 ? 0 : undefined ); let [query, setQuery] = useState( - `${props.metadata.title} ${props.metadata.artists && props.metadata.artists[0].name}` + `${props.metadata.title}${props.metadata.artists && ` ${props.metadata.artists[0].name}`}${props.metadata.albums && ` ${props.metadata.albums[0].name}`}` ) let [results, setResults] = useState([]); diff --git a/client/src/lib/integration/useIntegrations.tsx b/client/src/lib/integration/useIntegrations.tsx index 883b772..19e7888 100644 --- a/client/src/lib/integration/useIntegrations.tsx +++ b/client/src/lib/integration/useIntegrations.tsx @@ -5,6 +5,7 @@ import SpotifyClientCreds from "./spotify/SpotifyClientCreds"; import * as backend from "../backend/integrations"; import { handleNotLoggedIn, NotLoggedInError } from "../backend/request"; import { useAuth } from "../useAuth"; +import YoutubeMusicWebScraper from "./youtubemusic/YoutubeMusicWebScraper"; export type IntegrationState = { id: number, @@ -27,6 +28,7 @@ export interface Integrations { export const IntegrationClasses: Record = { [serverApi.IntegrationType.SpotifyClientCredentials]: SpotifyClientCreds, + [serverApi.IntegrationType.YoutubeWebScraper]: YoutubeMusicWebScraper, } export function makeDefaultIntegrationProperties(type: serverApi.IntegrationType): @@ -34,12 +36,20 @@ 
/**
 * Decodes the escape sequences of a JavaScript string-literal body
 * (the text between the quotes) into the string it denotes.
 *
 * Handles `\xNN`, `\uNNNN`, the common control escapes and "escaped
 * anything else" (`\"`, `\'`, `\\`, `\/`, ...). The input is scanned once,
 * left to right, so `\\"` is read as an escaped backslash followed by a
 * plain quote rather than a backslash followed by an escaped quote.
 */
function unescapeJsStringLiteral(literal: string): string {
    const controlEscapes: Record<string, string> = {
        n: '\n', r: '\r', t: '\t', b: '\b', f: '\f', v: '\v',
    };
    return literal.replace(
        /\\(x[0-9a-fA-F]{2}|u[0-9a-fA-F]{4}|[\s\S])/g,
        (_match: string, escape: string) => {
            if (escape.length > 1) {
                // \xNN or \uNNNN: the hex digits follow the marker letter.
                return String.fromCharCode(parseInt(escape.slice(1), 16));
            }
            const control = controlEscapes[escape];
            return control !== undefined ? control : escape;
        },
    );
}

/**
 * Extracts the search result payload embedded in a YouTube Music search page.
 *
 * At the time of writing, the page contains a block of the form
 *
 *     initialData.push({
 *         path: ...,
 *         params: {"query":"something"},
 *         data: "THIS",
 *     });
 *
 * and this function captures and decodes the THIS part, which is JSON stored
 * inside a JavaScript string literal.
 *
 * @param text Raw HTML/JS text of the search page.
 * @returns The parsed payload, or `undefined` when the block is not present,
 *          is empty, or does not decode to valid JSON.
 */
export function extractInitialData(text: string): any | undefined {
    // Capture the whole line holding the data literal. The trailing comma
    // after the closing quote is optional: it is present in the block
    // sketched above, but the pattern originally only accepted its absence.
    const pattern = /initialData\.push\(\{[\n\r\s]*path:.*[\n\r\s]+params:\s*\{\s*['"]query['"].*[\n\r\s]+data:\s*['"](.*)['"]\s*,?\s*[\n\r]/;
    const match = pattern.exec(text);
    const rawData = match ? match[1] : undefined;
    if (!rawData) { return undefined; }

    try {
        return JSON.parse(unescapeJsStringLiteral(rawData));
    } catch (e) {
        // The page layout is reverse-engineered and may change at any time;
        // treat undecodable data the same as missing data.
        console.log("Error parsing initial data:", e instanceof Error ? e.message : e);
        return undefined;
    }
}

/**
 * Returns the text runs of the flex column at `index` of a
 * musicResponsiveListItemRenderer, or an empty array when any part of the
 * expected structure is missing.
 */
function flexColumnRuns(renderer: any, index: number): any[] {
    const column = renderer && Array.isArray(renderer.flexColumns)
        ? renderer.flexColumns[index]
        : undefined;
    const text = column
        && column.musicResponsiveListItemFlexColumnRenderer
        && column.musicResponsiveListItemFlexColumnRenderer.text;
    return text && Array.isArray(text.runs) ? text.runs : [];
}

/**
 * Converts the runs that carry a browse link (artists, albums) into
 * `{ url, name }` pairs. Runs without a browse ID (separators, plain text)
 * are skipped.
 */
function browseLinks(runs: any[]): { url: string, name: string }[] {
    const links: { url: string, name: string }[] = [];
    for (const run of runs) {
        const endpoint = run && typeof run === 'object' ? run.navigationEndpoint : undefined;
        const browseId = endpoint && endpoint.browseEndpoint && endpoint.browseEndpoint.browseId;
        if (browseId) {
            links.push({
                url: `https://music.youtube.com/browse/${browseId}`,
                name: run.text,
            });
        }
    }
    return links;
}

/**
 * Builds a song from a single musicResponsiveListItemRenderer.
 *
 * @returns The song, or `undefined` when the item is not labelled "Song" or
 *          lacks a title or video ID.
 */
function songFromRenderer(renderer: any): IntegrationSong | undefined {
    // Column 1 starts with the item kind ("Song", "Video", "Album", ...).
    const kindRun = flexColumnRuns(renderer, 1)[0];
    if (!kindRun || kindRun.text !== "Song") { return undefined; }

    const titleRun = flexColumnRuns(renderer, 0)[0];
    const watchEndpoint = renderer.doubleTapCommand && renderer.doubleTapCommand.watchEndpoint;
    const videoId = watchEndpoint && watchEndpoint.videoId;
    if (!titleRun || !videoId) { return undefined; }

    const artists = browseLinks(flexColumnRuns(renderer, 2));
    const albums = browseLinks(flexColumnRuns(renderer, 3)).map((album) => {
        return {
            url: album.url,
            name: album.name,
            artist: artists[0],
        };
    });

    return {
        title: titleRun.text,
        url: `https://music.youtube.com/watch?v=${videoId}`,
        artist: artists[0],
        album: albums[0],
    };
}

/**
 * Scrapes all "Song"-type items out of a YouTube Music search payload as
 * returned by `extractInitialData`.
 *
 * Items are processed independently: an entry that does not match the
 * expected structure is skipped instead of discarding every result.
 *
 * @param initialData Parsed search payload; may be `undefined` or malformed.
 * @returns The songs found, in page order; an empty array when none could
 *          be parsed.
 */
export function parseSongs(initialData: any): IntegrationSong[] {
    const sections = initialData
        && initialData.contents
        && initialData.contents.sectionListRenderer
        && initialData.contents.sectionListRenderer.contents;
    if (!Array.isArray(sections)) { return []; }

    const songs: IntegrationSong[] = [];
    for (const section of sections) {
        const shelfItems = section
            && section.musicShelfRenderer
            && section.musicShelfRenderer.contents;
        if (!Array.isArray(shelfItems)) { continue; }

        for (const item of shelfItems) {
            const renderer = item && item.musicResponsiveListItemRenderer;
            if (!renderer) { continue; }
            try {
                const song = songFromRenderer(renderer);
                if (song) { songs.push(song); }
            } catch (e) {
                // Defensive: keep the remaining results if one item is odd.
                console.log("Error parsing songs:", e instanceof Error ? e.message : e);
            }
        }
    }
    return songs;
}
+ + return []; + // const response = await fetch( + // (process.env.REACT_APP_BACKEND || "") + + // `/integrations/${this.integrationId}/v1/search?q=${encodeURIComponent(query)}&type=${type}&limit=${limit}`); + + // if (!response.ok) { + // throw new Error("Spotify Client Credentials search failed: " + JSON.stringify(response)); + // } + + // let json = await response.json(); + + // console.log("Response:", json); + + // switch(type) { + // case SearchType.Song: { + // return json.tracks.items.map((r: any): IntegrationSong => { + // return { + // title: r.name, + // url: r.external_urls.spotify, + // artist: { + // name: r.artists && r.artists[0].name, + // url: r.artists && r.artists[0].external_urls.spotify, + // }, + // album: { + // name: r.album && r.album.name, + // url: r.album && r.album.external_urls.spotify, + // } + // } + // }) + // } + // case SearchType.Artist: { + // return json.artists.items.map((r: any): IntegrationArtist => { + // return { + // name: r.name, + // url: r.external_urls.spotify, + // } + // }) + // } + // case SearchType.Album: { + // return json.albums.items.map((r: any): IntegrationAlbum => { + // return { + // name: r.name, + // url: r.external_urls.spotify, + // artist: { + // name: r.artists[0].name, + // url: r.artists[0].external_urls.spotify, + // }, + // } + // }) + // } + } +} \ No newline at end of file diff --git a/server/integrations/integrations.ts b/server/integrations/integrations.ts index 4b109d6..93ef2d3 100644 --- a/server/integrations/integrations.ts +++ b/server/integrations/integrations.ts @@ -47,6 +47,17 @@ export function createIntegrations(knex: Knex) { } }); + let proxyYoutubeMusic = createProxyMiddleware({ + target: 'https://music.youtube.com/', + changeOrigin: true, + logLevel: 'debug', + pathRewrite: (path: string, req: any) => { + // Remove e.g. 
"/integrations/5" + console.log("Rewrite URL:", path); + return path.replace(/^\/integrations\/[0-9]+/, ''); + } + }) + // In the first layer, retrieve integration details and save details // in the request. return async (req: any, res: any, next: any) => { @@ -82,6 +93,10 @@ export function createIntegrations(knex: Knex) { req.headers["Authorization"] = "Bearer " + req._access_token; return proxySpotifyCC(req, res, next); } + case IntegrationType.YoutubeWebScraper: { + console.log("Integration: ", req._integration) + return proxyYoutubeMusic(req, res, next); + } default: { res.status(500).send({ reason: "Unsupported integration type " + req._integration.type }) }