From 7719b1a2a3101750f10544d5744f25848a2f02bd Mon Sep 17 00:00:00 2001 From: Aaron Parecki Date: Thu, 9 Jan 2020 21:28:57 -0600 Subject: [PATCH] move unsupported domains to a file, add a bunch from bridgy --- data/unsupported-domains.txt | 1070 ++++++++++++++++++++++++++++++++++ lib/Telegraph/Webmention.php | 21 +- 2 files changed, 1077 insertions(+), 14 deletions(-) create mode 100644 data/unsupported-domains.txt diff --git a/data/unsupported-domains.txt b/data/unsupported-domains.txt new file mode 100644 index 0000000..8007fa9 --- /dev/null +++ b/data/unsupported-domains.txt @@ -0,0 +1,1070 @@ +twitter.com +instagram.com +facebook.com +meetup.com +eventbrite.com +eventbrite.co.uk +github.com +blog.github.com +gitlab.com +t.co + +# copy some from bridgy +# https://github.com/snarfed/bridgy/blob/master/domain_blocklist.txt + +twitpic.com +feeds.wordpress.com +stats.wordpress.com +scoop.it +24sessions.com +500px.com +abakus-internet-marketing.de +about.me +ador.com +amazon.com +amazon.in +amzn.com +app.net +android.com +appbrain.com +ask.fm +asdf.com +audioscrobbler.com +awriterz.org +backtype.com +battlefield.com +behance.net +belong.io +bit.ly +bitbucket.org +blogger.com +brightkite.com +cdninstagram.com +change.org +claimid.com +cliqset.com +codepen.io +codeschool.com +conferize.com +connect.me +coursera.org +dailymotion.com +del.icio.us +delicious.com +deviantart.com +diasp.org +digg.com +disqus.com +dopplr.com +dribbble.com +drupal.org +eat.ly +edison.com +elgg.org +ello.co +etsy.com +evernote.com +example.com +facebook.com +fb.com +feedburner.com +ffffound.com +findings.com +flattr.com +flickr.com +flipboard.com +foursquare.com +friendfeed.com +friendster.com +gab.com +getglue.com +getsatisfaction.com +genius.com +ggpht.com +github.com +gitorious.org +gmail.com +gnolia.com +godudu.com +goodreads.com +google.com +gowalla.com +gu.com +guardian.co.uk +gumroad.com +here.com +hootsuite.com +huffduffer.com +hulu.com +hyves.nl +identi.ca +igraal.com +ink361.com +instagr.am +instagram.com +intensedebate.com +ipersonic.de +iwantmyname.com +jaiku.com +jamendo.com +joindiaspora.com +keybase.io +khanacademy.org +klout.com +kmworld.com +lanyrd.com +last.fm +lastfm.de +linkd.in +linkedin.com +live.com +livestream.com +mastodon.cloud +mastodon.social +mastodon.technology +medium.com +meetin.gs +meetup.com +mixcloud.com +mixx.com +mstdn.io +mstdn.jp +myopenid.com +myspace.com +myvideo.de +news.ycombinator.com +openstreetmap.org +orkut.com +pandora.com +paper.li +patreon.com +pawoo.net +personalinfocloud.com +photobucket.com +photoshop.com +pinboard.in +pinterest.com +plancast.com +plaxo.com +plazes.com +plinky.com +plurk.com +podhost.de +podspot.de +pontapreta.net +pownce.com +prezi.com +qik.com +quora.com +qype.com +raptr.com +rdio.com +readernaut.com +readlists.com +readmill.com +rebelmouse.com +reddit.com +researchgate.net +rhapsody.com +scribd.com +secondlife.com +seesmic.com +shelfari.com +shutterfly.com +skitch.com +slideshare.net +snookerblog.de +so.cl +socialmedian.com +soundcloud.com +speakerdeck.com +spotify.com +stackexchange.com +stackoverflow.com +steamcommunity.com +striking.ly +strikingly.com +stumbleupon.com +switter.at +technorati.com +tent.is +theonion.com +thesession.org +tinyletter.com +tribe.net +tripit.com +tweakers.net +twit.tv +twitch.com +twitch.tv +twitter.com +twittercounter.com +typepad.com +ubuntu.com +userscripts.org +ustream.tv +v.gd +vanderwal.net +viadeo.com +viddler.com +vimeo.com +vodspot.tv +wikipedia.org +xda-developers.com +xfire.com +xing.com +yahoo.com +ycombinator.com +yelp.com +yelp.de +youtu.be +youtube.com +wordpress.org + +# URL shortener domains. Gratefully stolen from http://longurl.org/services +# See also: http://uribl.com/, https://github.com/piwik/referrer-spam-blacklist +0rz.tw +1link.in +1url.com +2.gp +2big.at +2tu.us +3.ly +307.to +4ms.me +4sq.com +4url.cc +6url.com +7.ly +a.gg +a.nf +aa.cx +abcurl.net +ad.vu +adf.ly +adjix.com +afx.cc +all.fuseurl.com +alturl.com +amzn.to +ar.gy +arst.ch +atu.ca +azc.cc +b23.ru +b2l.me +bacn.me +bcool.bz +binged.it +bit.ly +bizj.us +bloat.me +bravo.ly +bsa.ly +budurl.com +canurl.com +chilp.it +chzb.gr +cl.lk +cl.ly +clck.ru +cli.gs +cliccami.info +clickthru.ca +clop.in +conta.cc +cort.as +cot.ag +crks.me +ctvr.us +cutt.us +dai.ly +decenturl.com +dfl8.me +digbig.com +digg.com +disq.us +dld.bz +dlvr.it +do.my +doiop.com +dopen.us +easyuri.com +easyurl.net +eepurl.com +eweri.com +fa.by +fav.me +fb.me +fbshare.me +ff.im +fff.to +fire.to +firsturl.de +firsturl.net +flic.kr +flq.us +fly2.ws +fon.gs +freak.to +fuseurl.com +fuzzy.to +fwd4.me +fwib.net +g.ro.lt +gizmo.do +gl.am +go.9nl.com +go.ign.com +go.usa.gov +goo.gl +goshrink.com +gurl.es +hex.io +hiderefer.com +hmm.ph +href.in +hsblinks.com +htxt.it +huff.to +hulu.com +hurl.me +hurl.ws +icanhaz.com +idek.net +ilix.in +is.gd +its.my +ix.lt +j.mp + +jijr.com +kl.am +klck.me +korta.nu +krunchd.com +l9k.net +lat.ms +liip.to +liltext.com +linkbee.com +linkbun.ch +liurl.cn +ln-s.net +ln-s.ru +lnk.gd +lnk.ms +lnkd.in +lnkurl.com +lru.jp +lt.tl +lurl.no +macte.ch +mash.to +merky.de +migre.me +miniurl.com +minurl.fr +mke.me +moby.to +moourl.com +mrte.ch +myloc.me +myurl.in +n.pr +nbc.co +nblo.gs +nn.nf +not.my +notlong.com +nsfw.in +nutshellurl.com +nxy.in +nyti.ms +o-x.fr +oc1.us +om.ly +omf.gd +omoikane.net +on.cnn.com +on.mktw.net +onforb.es +orz.se +ow.ly +ping.fm +pli.gs +pnt.me +politi.co +post.ly +pp.gg +profile.to +ptiturl.com +pub.vitrue.com +qlnk.net +qte.me +qu.tc +qy.fi +r.im +rb6.me +read.bi +readthis.ca +reallytinyurl.com +redir.ec +redirects.ca +redirx.com +retwt.me +ri.ms +rickroll.it +riz.gd +rt.nu +ru.ly +rubyurl.com +rurl.org +rww.tw +s4c.in +s7y.us +safe.mn +sameurl.com +sdut.us +shar.es +shink.de +shorl.com +short.ie +short.to +shortlinks.co.uk +shorturl.com +shout.to +show.my +shrinkify.com +shrinkr.com +shrt.fr +shrt.st +shrten.com +shrunkin.com +simurl.com +slate.me +smallr.com +smsh.me +smurl.name +sn.im +snipr.com +snipurl.com +snurl.com +sp2.ro +spedr.com + +srnk.net +srs.li +starturl.com +su.pr +surl.co.uk +surl.hu +t.cn +t.co +t.lh.com +ta.gd +tbd.ly +tcrn.ch +tgr.me +tgr.ph +tighturl.com +tiniuri.com +tiny.cc +tiny.ly +tiny.pl +tinylink.in +tinyuri.ca +tinyurl.com +tk. +tl.gd +tmi.me +tnij.org +tnw.to +tny.com +to. +to.ly +togoto.us +totc.us +toysr.us +tpm.ly +tr.im +tra.kz +trunc.it +twhub.com +twirl.at +twitclicks.com +twitterurl.net +twitterurl.org +twiturl.de +twurl.cc +twurl.nl +u.mavrev.com +u.nu +u76.org +ub0.cc +ulu.lu +updating.me +ur1.ca +url.az +url.co.uk +url.ie +url360.me +url4.eu +urlborg.com +urlbrief.com +urlcover.com +urlcut.com +urlenco.de +urli.nl +urls.im +urlshorteningservicefortwitter.com +urlx.ie +urlzen.com +usat.ly +use.my +vb.ly +vgn.am +vl.am +vm.lc +w55.de +wapo.st +wapurl.co.uk +wipi.es +wp.me +x.vu +xr.com +xrl.in +xrl.us +xurl.es +xurl.jp +y.ahoo.it +yatuc.com +ye.pe +yep.it +yfrog.com +yhoo.it +yiyd.com +youtu.be +yuarel.com +z0p.de +zi.ma +zi.mu +zipmyurl.com +zud.me +zurl.ws +zz.gd +zzang.kr +›.ws +✩.ws +✿.ws +❥.ws +➔.ws +➞.ws +➡.ws +➨.ws +➯.ws +➹.ws +➽.ws + +# top 500 web sites by incoming links by domain, as of jan 2014 +# gratefully stolen from https://moz.com/top500 +facebook.com +twitter.com +google.com +youtube.com +wordpress.org +adobe.com +# blogspot.com +wikipedia.org +linkedin.com +# wordpress.com +yahoo.com +amazon.com +flickr.com +pinterest.com +# tumblr.com +w3.org +apple.com +myspace.com +vimeo.com +microsoft.com +youtu.be +qq.com +digg.com +baidu.com +stumbleupon.com +addthis.com +statcounter.com +feedburner.com +miibeian.gov.cn +delicious.com +nytimes.com +reddit.com +weebly.com +bbc.co.uk +blogger.com +msn.com +macromedia.com +goo.gl +instagram.com +gov.uk +icio.us +yandex.ru +cnn.com +webs.com +google.de +t.co +livejournal.com +imdb.com +mail.ru +jimdo.com +sourceforge.net +go.com +tinyurl.com +vk.com +google.co.jp +fc2.com +free.fr +joomla.org +creativecommons.org +typepad.com +networkadvertising.org +technorati.com +sina.com.cn +hugedomains.com +about.com +theguardian.com +yahoo.co.jp +nih.gov +huffingtonpost.com +google.co.uk +mozilla.org +51.la +aol.com +ebay.com +ameblo.jp +wsj.com +europa.eu +taobao.com +bing.com +rambler.ru +guardian.co.uk +tripod.com +godaddy.com +issuu.com +gnu.org +geocities.com +slideshare.net +wix.com +mapquest.com +washingtonpost.com +homestead.com +reuters.com +163.com +photobucket.com +forbes.com +clickbank.net +weibo.com +etsy.com +amazon.co.uk +dailymotion.com +soundcloud.com +usatoday.com +yelp.com +cnet.com +posterous.com +telegraph.co.uk +archive.org +google.fr +constantcontact.com +phoca.cz +phpbb.com +latimes.com +e-recht24.de +rakuten.co.jp +amazon.de +opera.com +miitbeian.gov.cn +php.net +scribd.com +bbb.org +parallels.com +ning.com +dailymail.co.uk +cdc.gov +sohu.com +wikimedia.org +deviantart.com +# mit.edu +sakura.ne.jp +altervista.org +addtoany.com +time.com +google.it +# stanford.edu +live.com +alibaba.com +squidoo.com +# harvard.edu +gravatar.com +histats.com +nasa.gov +npr.org +ca.gov +eventbrite.com +wired.com +amazon.co.jp +nbcnews.com +# blog.com +amazonaws.com +bloomberg.com +narod.ru +blinklist.com +imageshack.us +kickstarter.com +hatena.ne.jp +nifty.com +angelfire.com +google.es +ocn.ne.jp +over-blog.com +dedecms.com +google.ca +a8.net +weather.com +pbs.org +ibm.com +cpanel.net +prweb.com +bandcamp.com +barnesandnoble.com +mozilla.com +noaa.gov +goo.ne.jp +comsenz.com +xrea.com +cbsnews.com +foxnews.com +discuz.net +eepurl.com +businessweek.com +# berkeley.edu +newsvine.com +bluehost.com +geocities.jp +loc.gov +yolasite.com +apache.org +mashable.com +usda.gov +nationalgeographic.com +whitehouse.gov +tripadvisor.com +ted.com +sfgate.com +biglobe.ne.jp +epa.gov +vkontakte.ru +oracle.com +seesaa.net +examiner.com +# cornell.edu +hp.com +nps.gov +disqus.com +alexa.com +mysql.com +house.gov +sphinn.com +boston.com +techcrunch.com +un.org +# squarespace.com +icq.com +freewebs.com +ezinearticles.com +ucoz.ru +independent.co.uk +mediafire.com +xinhuanet.com +google.nl +reverbnation.com +imgur.com +irs.gov +webnode.com +wunderground.com +bizjournals.com +who.int +soup.io +cloudflare.com +people.com.cn +ustream.tv +senate.gov +cbslocal.com +ycombinator.com +opensource.org +spiegel.de +oaic.gov.au +nature.com +businessinsider.com +drupal.org +last.fm +privacy.gov.au +skype.com +wikia.com +about.me +webmd.com +youku.com +gmpg.org +fda.gov +redcross.org +github.com +cbc.ca +# umich.edu +jugem.jp +shinystat.com +google.com.br +ifeng.com +mac.com +wiley.com +discovery.com +topsy.com +paypal.com +google.cn +surveymonkey.com +moonfruit.com +dropbox.com +exblog.jp +google.pl +prnewswire.com +ft.com +uol.com.br +behance.net +goodreads.com +netvibes.com +auda.org.au +marketwatch.com +ed.gov +networksolutions.com +state.gov +sitemeter.com +liveinternet.ru +ftc.gov +census.gov +quantcast.com +economist.com +nydailynews.com +zdnet.com +cafepress.com +ow.ly +meetup.com +netscape.com +chicagotribune.com +theatlantic.com +google.com.au +1688.com +skyrock.com +list-manage.com +pagesperso-orange.fr +cdbaby.com +friendfeed.com +ehow.com +patch.com +# upenn.edu +engadget.com +diigo.com +com.com +slashdot.org +# washington.edu +# columbia.edu +nhs.uk +abc.net.au +elegantthemes.com +# utexas.edu +# yale.edu +marriott.com +bigcartel.com +# ucla.edu +usgs.gov +jigsy.com +hexun.com +hubpages.com +slate.com +purevolume.com +# umn.edu +bloglines.com +so-net.ne.jp +wikispaces.com +cargocollective.com +howstuffworks.com +plala.or.jp +infoseek.co.jp +jiathis.com +usnews.com +xing.com +flavors.me +desdev.cn +hc360.com +usa.gov +edublogs.org +lycos.com +# wisc.edu +thetimes.co.uk +state.tx.us +example.com +shareasale.com +biblegateway.com +is.gd +yellowbook.com +samsung.com +businesswire.com +g.co +dion.ne.jp +dagondesign.com +theglobeandmail.com +booking.com +storify.com +salon.com +ucoz.com +gizmodo.com +# psu.edu +smh.com.au +reference.com +sun.com +unicef.org +devhub.com +artisteer.com +unesco.org +istockphoto.com +answers.com +trellian.com +cocolog-nifty.com +i2i.jp +t-online.de +intel.com +1und1.de +ebay.co.uk +sciencedaily.com +paginegialle.it +ask.com +springer.com +canalblog.com +timesonline.co.uk +de.vu +deliciousdays.com +smugmug.com +wufoo.com +globo.com +# cmu.edu +domainmarket.com +odnoklassniki.ru +twitpic.com +ovh.net +home.pl +naver.com +google.ru +# si.edu +newyorker.com +blogs.com +sciencedirect.com +hibu.com +hud.gov +hhs.gov +dmoz.org +dot.gov +cyberchimps.com +google.com.hk +jalbum.net +craigslist.org +zimbio.com +chronoengine.com +cnbc.com +# uiuc.edu +vistaprint.com +symantec.com +prlog.org +360.cn +indiatimes.com +mtv.com +webeden.co.uk +java.com +cisco.com +japanpost.jp +4shared.com +# github.io +mayoclinic.com +studiopress.com +admin.ch +# virginia.edu +printfriendly.com +mlb.com +omniture.com +simplemachines.org +dell.com +accuweather.com +# princeton.edu +fotki.com +comcast.net +chron.com +# nyu.edu +# wp.com +merriam-webster.com +nba.com +shop-pro.jp +lulu.com +furl.net +indiegogo.com +buzzfeed.com +tuttocitta.it +ox.ac.uk +mapy.cz +army.mil +csmonitor.com +bravesites.com +# tamu.edu +rediff.com +toplist.cz +yellowpages.com +va.gov +tiny.cc +netlog.com +elpais.com +oakley.com +multiply.com +tmall.com +hostgator.com +nymag.com +fema.gov +blogtalkradio.com +china.com.cn +unblog.fr +fastcompany.com +# earthlink.net +vinaora.com +# msu.edu +aboutads.info +# ucsd.edu +sogou.com +seattletimes.com +# dyndns.org +123-reg.co.uk +sbwire.com +tinypic.com +acquirethisname.com +shutterfly.com +walmart.com +# pen.io +# arizona.edu +woothemes.com +scientificamerican.com +themeforest.net +spotify.com +cam.ac.uk +# unc.edu +arstechnica.com +hao123.com +# illinois.edu +bloglovin.com +nsw.gov.au +ihg.com +pcworld.com diff --git a/lib/Telegraph/Webmention.php b/lib/Telegraph/Webmention.php index d45d052..d6ae43e 100644 --- a/lib/Telegraph/Webmention.php +++ b/lib/Telegraph/Webmention.php @@ -7,6 +7,8 @@ class Webmention { private static $http = false; + private static $unsupported = null; + // Returns false if the target URL is known to not accept webmentions public static function isProbablySupported($targetURL) { // Reject links that are known to not accept webmentions @@ -14,20 +16,11 @@ class Webmention { if(!$host) return false; - $unsupported = [ - 'twitter.com', - 'instagram.com', - 'facebook.com', - 'meetup.com', - 'eventbrite.com', - 'eventbrite.co.uk', - 'github.com', - 'blog.github.com', - 'gitlab.com', - 't.co', - ]; - - if(in_array($host, $unsupported)) + if(self::$unsupported === null) { + self::$unsupported = explode("\n", file_get_contents(__DIR__.'/../../data/unsupported-domains.txt')); + } + + if(in_array($host, self::$unsupported)) return false; if(preg_match('/.+\.amazonaws\.com/', $host))